[clang] [analyzer] Switch to PostStmt callbacks in ArrayBoundV2 (PR #72107)

via cfe-commits cfe-commits at lists.llvm.org
Mon Nov 13 03:59:03 PST 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-static-analyzer-1

Author: None (DonatNagyE)

<details>
<summary>Changes</summary>

...instead of the currently used, more abstract Location callback. The main advantage of this change is that after it the checker will check `array[index].field` while the previous implementation ignored this situation (because here the ElementRegion is wrapped in a FieldRegion object). This improvement fixes PR #<!-- -->70187.

Note that after this change `&array[idx]` will be handled as an access to the `idx`th element of `array`, which is technically incorrect but matches the programmer intuitions. In my opinion it's more helpful if the report points to the source location where the indexing happens (instead of the location where a pointer is finally dereferenced).

This change introduces false positives in the exceptional corner case when the past-the-end pointer of an array is formed as `&arr[length]`. I think this is just unimportant pedantery (because it's cleaner to write the past-the-end pointer as `(arr+length)` and that form isn't affected by this checker), but if it does appear in real code, then we could introduce a special case for it.

In addition to this primary improvement, this change tweaks the message for the tainted index/offset case (using the more concrete information that's available now) and clarifies/improves a few testcases.

The main change of this commit (replacing `check::Location` with `check::PostStmt<...>` callbacks) was already proposed in my change https://reviews.llvm.org/D150446 and https://reviews.llvm.org/D159107 by @<!-- -->steakhal. Those reviews were both abandoned, but the issues were unrelated to the change that is introduced in this PR.

---
Full diff: https://github.com/llvm/llvm-project/pull/72107.diff


3 Files Affected:

- (modified) clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp (+60-36) 
- (modified) clang/test/Analysis/out-of-bounds-diagnostics.c (+67-7) 
- (modified) clang/test/Analysis/taint-diagnostic-visitor.c (+2-2) 


``````````diff
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index ffc7236d1e2551a..ef3ae42d57c6e88 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -34,20 +34,37 @@ using llvm::formatv;
 namespace {
 enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
 
-class ArrayBoundCheckerV2 :
-    public Checker<check::Location> {
+struct Messages {
+  std::string Short, Full;
+};
+
+class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
+                                           check::PostStmt<UnaryOperator>,
+                                           check::PostStmt<MemberExpr>> {
   BugType BT{this, "Out-of-bound access"};
   BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
 
+  void performCheck(const Expr *E, CheckerContext &C) const;
+
   void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
-                 NonLoc Offset, std::string RegName, std::string Msg) const;
+                 NonLoc Offset, Messages Msgs) const;
 
   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
 
 public:
-  void checkLocation(SVal l, bool isLoad, const Stmt *S,
-                     CheckerContext &C) const;
+  void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
+    performCheck(E, C);
+  }
+  void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
+    if (E->getOpcode() == UO_Deref)
+      performCheck(E, C);
+  }
+  void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
+    if (E->isArrow())
+      performCheck(E->getBase(), C);
+  }
 };
+
 } // anonymous namespace
 
 /// For a given Location that can be represented as a symbolic expression
@@ -217,16 +234,19 @@ static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
   return formatv(ShortMsgTemplates[Kind], RegName);
 }
 
-static std::string getPrecedesMsg(std::string RegName, NonLoc Offset) {
+static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
+  std::string RegName = getRegionName(Region);
   SmallString<128> Buf;
   llvm::raw_svector_ostream Out(Buf);
   Out << "Access of " << RegName << " at negative byte offset";
   if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
     Out << ' ' << ConcreteIdx->getValue();
-  return std::string(Buf);
+  return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
 }
-static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
-                                 NonLoc Offset, NonLoc Extent, SVal Location) {
+
+static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
+                               NonLoc Offset, NonLoc Extent, SVal Location) {
+  std::string RegName = getRegionName(Region);
   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
   assert(EReg && "this checker only handles element access");
   QualType ElemType = EReg->getElementType();
@@ -273,20 +293,18 @@ static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
       Out << "s";
   }
 
-  return std::string(Buf);
-}
-static std::string getTaintMsg(std::string RegName) {
-  SmallString<128> Buf;
-  llvm::raw_svector_ostream Out(Buf);
-  Out << "Access of " << RegName
-      << " with a tainted offset that may be too large";
-  return std::string(Buf);
+  return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
 }
 
-void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
-                                        const Stmt *LoadS,
-                                        CheckerContext &C) const {
+static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
+  std::string RegName = getRegionName(Region);
+  return {formatv("Potential out of bound access to {0} with tainted {1}",
+                  RegName, OffsetName),
+          formatv("Access of {0} with a tainted {1} that may be too large",
+                  RegName, OffsetName)};
+}
 
+void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
   // some new logic here that reasons directly about memory region extents.
   // Once that logic is more mature, we can bring it back to assumeInBound()
@@ -297,12 +315,14 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
   // have some flexibility in defining the base region, we can achieve
   // various levels of conservatism in our buffer overflow checking.
 
+  const SVal Location = C.getSVal(E);
+
   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
   //   #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
   // and incomplete analysis of these leads to false positives. As even
   // accurate reports would be confusing for the users, just disable reports
   // from these macros:
-  if (isFromCtypeMacro(LoadS, C.getASTContext()))
+  if (isFromCtypeMacro(E, C.getASTContext()))
     return;
 
   ProgramStateRef State = C.getState();
@@ -331,9 +351,8 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
 
     if (PrecedesLowerBound && !WithinLowerBound) {
       // We know that the index definitely precedes the lower bound.
-      std::string RegName = getRegionName(Reg);
-      std::string Msg = getPrecedesMsg(RegName, ByteOffset);
-      reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, RegName, Msg);
+      Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
+      reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
       return;
     }
 
@@ -350,17 +369,25 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
     if (ExceedsUpperBound) {
       if (!WithinUpperBound) {
         // We know that the index definitely exceeds the upper bound.
-        std::string RegName = getRegionName(Reg);
-        std::string Msg = getExceedsMsg(C.getASTContext(), RegName, ByteOffset,
-                                        *KnownSize, Location);
-        reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, RegName, Msg);
+        Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
+                                       *KnownSize, Location);
+        reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
         return;
       }
       if (isTainted(State, ByteOffset)) {
-        // Both cases are possible, but the index is tainted, so report.
+        // Both cases are possible, but the offset is tainted, so report.
         std::string RegName = getRegionName(Reg);
-        std::string Msg = getTaintMsg(RegName);
-        reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, RegName, Msg);
+
+        // Diagnostic detail: "tainted offset" is always correct, but the
+        // common case is that 'idx' is tainted in 'arr[idx]' and then it's
+        // nicer to say "tainted index".
+        const char *OffsetName = "offset";
+        if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
+          if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
+            OffsetName = "index";
+
+        Messages Msgs = getTaintMsgs(Reg, OffsetName);
+        reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
         return;
       }
     }
@@ -374,17 +401,14 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
 
 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
                                     ProgramStateRef ErrorState, OOB_Kind Kind,
-                                    NonLoc Offset, std::string RegName,
-                                    std::string Msg) const {
+                                    NonLoc Offset, Messages Msgs) const {
 
   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
   if (!ErrorNode)
     return;
 
-  std::string ShortMsg = getShortMsg(Kind, RegName);
-
   auto BR = std::make_unique<PathSensitiveBugReport>(
-      Kind == OOB_Taint ? TaintBT : BT, ShortMsg, Msg, ErrorNode);
+      Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
 
   // Track back the propagation of taintedness.
   if (Kind == OOB_Taint)
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 4f5d0d0bc91c54a..e549e86b09c2312 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -9,6 +9,14 @@ void arrayUnderflow(void) {
   // expected-note at -2 {{Access of 'array' at negative byte offset -12}}
 }
 
+int underflowWithDeref(void) {
+  int *p = array;
+  --p;
+  return *p;
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset -4}}
+}
+
 int scanf(const char *restrict fmt, ...);
 
 void taintedIndex(void) {
@@ -17,6 +25,17 @@ void taintedIndex(void) {
   // expected-note at -1 {{Taint originated here}}
   // expected-note at -2 {{Taint propagated to the 2nd argument}}
   array[index] = 5;
+  // expected-warning at -1 {{Potential out of bound access to 'array' with tainted index}}
+  // expected-note at -2 {{Access of 'array' with a tainted index that may be too large}}
+}
+
+void taintedOffset(void) {
+  int index;
+  scanf("%d", &index);
+  // expected-note at -1 {{Taint originated here}}
+  // expected-note at -2 {{Taint propagated to the 2nd argument}}
+  int *p = array + index;
+  p[0] = 5;
   // expected-warning at -1 {{Potential out of bound access to 'array' with tainted offset}}
   // expected-note at -2 {{Access of 'array' with a tainted offset that may be too large}}
 }
@@ -27,6 +46,29 @@ void arrayOverflow(void) {
   // expected-note at -2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
 }
 
+void flippedOverflow(void) {
+  12[array] = 5;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
+}
+
+int *afterTheEndPtr(void) {
+  // FIXME: this is an unusual but standard-compliant way of writing (array + 10).
+  // I think people who rely on this confusing corner case deserve a warning,
+  // but if this is widespread, then we could introduce a special case for it
+  // in the checker.
+  return &array[10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 10, while it holds only 10 'int' elements}}
+}
+
+int *afterAfterTheEndPtr(void) {
+  // This is UB, it's invalid to form an after-after-the-end pointer.
+  return &array[11];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 11, while it holds only 10 'int' elements}}
+}
+
 int scalar;
 int scalarOverflow(void) {
   return (&scalar)[1];
@@ -41,12 +83,6 @@ int oneElementArrayOverflow(void) {
   // expected-note at -2 {{Access of 'oneElementArray' at index 1, while it holds only a single 'int' element}}
 }
 
-short convertedArray(void) {
-  return ((short*)array)[47];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}}
-}
-
 struct vec {
   int len;
   double elems[64];
@@ -64,6 +100,28 @@ double arrayInStructPtr(struct vec *pv) {
   // expected-note at -2 {{Access of the field 'elems' at index 64, while it holds only 64 'double' elements}}
 }
 
+struct item {
+  int a, b;
+} itemArray[20] = {0};
+
+int structOfArrays(void) {
+  return itemArray[35].a;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'itemArray'}}
+  // expected-note at -2 {{Access of 'itemArray' at index 35, while it holds only 20 'struct item' elements}}
+}
+
+int structOfArraysArrow(void) {
+  return (itemArray + 35)->b;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'itemArray'}}
+  // expected-note at -2 {{Access of 'itemArray' at index 35, while it holds only 20 'struct item' elements}}
+}
+
+short convertedArray(void) {
+  return ((short*)array)[47];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}}
+}
+
 struct two_bytes {
   char lo, hi;
 };
@@ -85,7 +143,9 @@ int intFromString(void) {
 }
 
 int intFromStringDivisible(void) {
-  // However, this is reported with indices/elements, because the extent happens to be a multiple of 4.
+  // However, this is reported with indices/elements, because the extent
+  // (of the string that consists of 'a', 'b', 'c' and '\0') happens to be a
+  // multiple of 4 bytes (= sizeof(int)).
   return ((const int*)"abc")[20];
   // expected-warning at -1 {{Out of bound access to memory after the end of the string literal}}
   // expected-note at -2 {{Access of the string literal at index 20, while it holds only a single 'int' element}}
diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c
index 67dc67e627b3b9c..a3fa1639bffeee5 100644
--- a/clang/test/Analysis/taint-diagnostic-visitor.c
+++ b/clang/test/Analysis/taint-diagnostic-visitor.c
@@ -29,8 +29,8 @@ int taintDiagnosticOutOfBound(void) {
   int Array[] = {1, 2, 3, 4, 5};
   scanf("%d", &index); // expected-note {{Taint originated here}}
                        // expected-note at -1 {{Taint propagated to the 2nd argument}}
-  return Array[index]; // expected-warning {{Potential out of bound access to 'Array' with tainted offset}}
-                       // expected-note at -1 {{Access of 'Array' with a tainted offset that may be too large}}
+  return Array[index]; // expected-warning {{Potential out of bound access to 'Array' with tainted index}}
+                       // expected-note at -1 {{Access of 'Array' with a tainted index that may be too large}}
 }
 
 int taintDiagnosticDivZero(int operand) {

``````````

</details>


https://github.com/llvm/llvm-project/pull/72107


More information about the cfe-commits mailing list