[clang] [llvm] [NFC][analyzer] Extract bounds checking library (PR #202372)

Mon Jun 29 02:08:16 PDT 2026

================
@@ -0,0 +1,211 @@
+//===- BoundsChecking.h - Bounds checking related APIs ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines APIs for performing a bounds check (i.e. comparing a
+//  symbolic Offset value to zero and a symbolic Extent value) and composing
+//  descriptions that explain its results.
+//
+//  This is intended as a replacement for `ProgramState::assumeInBound` to
+//  avoid its incorrect logic and compensate for deficiencies of other parts of
+//  the analyzer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_BOUNDSCHECKING_H
+#define LLVM_CLANG_STATICANALYZER_CORE_PATHSENSITIVE_BOUNDSCHECKING_H
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <optional>
+
+namespace clang {
+namespace ento {
+
+/// If `E` is an array subscript expression with a base that is "clean" (= not
+/// modified by pointer arithmetic = the beginning of a memory region), return
+/// it as a pointer to ArraySubscriptExpr; otherwise return nullptr.
+/// This helper function is used by two separate heuristics that are only valid
+/// in these "clean" cases.
+const ArraySubscriptExpr *getAsCleanArraySubscriptExpr(const Expr *E,
+                                                       const CheckerContext &C);
+
+/// The unit in which offsets and extents are expressed: either plain bytes or
+/// elements of a concrete type.
+///
+/// The "bytes" unit is represented by a null `AsType` and a size of one char
+/// unit; any other unit stores the element type together with the size that
+/// `ASTContext::getTypeSizeInChars` reports for it.
+class SizeUnit {
+  QualType AsType;
+  int64_t AsCharUnits;
+
+  /// Construct the "bytes" unit; only reachable through `bytes()`.
+  SizeUnit() : AsType(), AsCharUnits(1) {}
+
+public:
+  /// Construct the unit that measures sizes in elements of type `T`.
+  /// `T` must not be null (a null type is reserved for the "bytes" unit).
+  SizeUnit(QualType T, const ASTContext &ACtx)
+      : AsType(T), AsCharUnits(ACtx.getTypeSizeInChars(T).getQuantity()) {
+    assert(!T.isNull());
+  }
+
+  /// Return the unit that measures sizes in bytes.
+  static SizeUnit bytes() { return SizeUnit(); }
+
+  /// Return true if this is the "bytes" unit.
+  bool isBytes() const { return AsType.isNull(); }
+
+  /// If `E` is a "clean" array subscript expression, return the type of the
+  /// accessed element; otherwise return 'Bytes' because that's the best (or
+  /// least bad) option for the assumption messages that use this.
+  static SizeUnit forExpr(const Expr *E, const CheckerContext &C) {
+    const auto *ASE = getAsCleanArraySubscriptExpr(E, C);
+    if (!ASE)
+      return bytes();
+
+    return SizeUnit(ASE->getType(), C.getASTContext());
+  }
+
+  /// Return the element type that is "natural" for reporting out-of-bounds
+  /// memory access to 'Location'.
+  /// `Location` must refer to an `ElementRegion`; both the presence of a
+  /// region and its kind are asserted.
+  /// FIXME: It is unfortunate that this heuristic differs from the heuristic
+  /// used for reporting assumption (`SizeUnit::forExpr`).
+  static SizeUnit forSVal(SVal Location, const ASTContext &ACtx) {
+    // `getAsRegion()` may return null, so check it separately instead of
+    // dereferencing it before any assertion had a chance to fire.
+    const MemRegion *Reg = Location.getAsRegion();
+    assert(Reg && "expected a location that is backed by a memory region");
+    const auto *EReg = Reg->getAs<ElementRegion>();
+    assert(EReg && "this checker only handles element access");
+    return SizeUnit(EReg->getElementType(), ACtx);
+  }
+
+  /// Return the size of one unit, measured in char units (1 for bytes).
+  int64_t asCharUnits() const { return AsCharUnits; }
+
+  /// Return the phrase that introduces the extent of a region in this unit.
+  std::string asExtentDesc() const {
+    if (isBytes())
+      return "the extent of";
+    return llvm::formatv("the number of '{0}' elements in",
+                         AsType.getAsString());
+  }
+
+  /// Return the singular noun that names one unit.
+  std::string asElementName() const {
+    if (isBytes())
+      return "byte";
+    return llvm::formatv("'{0}' element", AsType.getAsString());
+  }
+
+  /// Return the name used for an offset that is measured in this unit.
+  std::string getOffsetName() const {
+    return isBytes() ? "byte offset" : "index";
+  }
+
+  /// Try to divide `Val1` and `Val2` (in place) by `this->asCharUnits()` and
+  /// return true if it can be performed without remainder. The values `Val1`
+  /// and `Val2` may be nullopt and in that case the corresponding division is
+  /// considered to be successful.
+  bool tryConvertValuesFromBytes(std::optional<int64_t> &Val1,
+                                 std::optional<int64_t> &Val2) const;
+};
+
+/// A pair of message strings named `Short` and `Full` (presumably a brief and
+/// a detailed wording of the same diagnostic -- confirm against the users).
+struct Messages {
+  std::string Short;
+  std::string Full;
+};
+
+/// The ways in which an offset can be invalid.
+/// NOTE: The enumerators are used as zero-based indices into `Adjectives`, so
+/// the two definitions must list their entries in the same order.
+enum class BadOffsetKind { Negative, Overflowing, Indeterminate };
+
+/// Adjective phrases indexed by `BadOffsetKind`.
+/// Declared `inline` so that every translation unit refers to the same array;
+/// a plain namespace-scope `constexpr` array would have internal linkage and
+/// using it from the inline function below would violate the ODR.
+inline constexpr llvm::StringLiteral Adjectives[] = {
+    "a negative", "an overflowing", "a negative or overflowing"};
+static_assert(std::size(Adjectives) ==
+                  static_cast<std::size_t>(BadOffsetKind::Indeterminate) + 1,
+              "Adjectives must have exactly one entry per BadOffsetKind");
+
+/// Return the adjective phrase (including its article) that describes
+/// `Problem`.
+inline StringRef asAdjective(BadOffsetKind Problem) {
+  const auto Idx = static_cast<std::size_t>(Problem);
+  // Only a deliberately out-of-range cast can produce such a value, but catch
+  // it in assert-enabled builds instead of reading past the table.
+  assert(Idx < std::size(Adjectives) && "invalid BadOffsetKind value");
+  return Adjectives[Idx];
+}
----------------
NagyDonat wrote:

> Currently, it will silently overflow when passing `Indeterminate`.

No, it does not overflow, `Indeterminate` is equal to 2.

For me it was a traditional pattern that `enum {A, B, C};` and `string Array[] = {"A", "B", "C"};` are in a natural correspondence (i.e. `Array[A] == "A"`, `Array[B] == "B"` and `Array[C] == "C"`) because everything is zero-based. Also, I felt that the `enum class` is enough of a guarantee to rule out overflows (only an explicitly fishy cast could create an overflowing `BadOffsetKind`).

However, I could switch to another way of defining this enum -> string mapping if you think that this is confusing. 

What would be your preferred solution for introducing this sort of mapping? Just using `std::array`, or some other pattern (e.g. a `switch`)?

https://github.com/llvm/llvm-project/pull/202372