[clang] nonblocking/nonallocating attributes: 2nd pass caller/callee analysis (PR #99656)

Thu Jul 25 20:45:12 PDT 2024

================
@@ -2397,6 +2397,1262 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler {
 };
 } // namespace
 
+// =============================================================================
+
+namespace FXAnalysis {
+
+enum class DiagnosticID : uint8_t {
+  None = 0, // sentinel for an empty Diagnostic
+  Throws,
+  Catches,
+  CallsObjC,
+  AllocatesMemory,
+  HasStaticLocal,
+  AccessesThreadLocal,
+
+  // These only apply to callees, where the analysis stops at the Decl
+  DeclDisallowsInference,
+
+  CallsDeclWithoutEffect,
+  CallsExprWithoutEffect,
+};
+
+// Holds an effect diagnosis, potentially for the entire duration of the
+// analysis phase, in order to refer to it when explaining why a caller has been
+// made unsafe by a callee.
+struct Diagnostic {
+  FunctionEffect Effect;
+  DiagnosticID ID = DiagnosticID::None;
+  SourceLocation Loc;
+  const Decl *Callee = nullptr; // only valid for Calls*
+
+  Diagnostic() = default;
+
+  Diagnostic(const FunctionEffect &Effect, DiagnosticID ID, SourceLocation Loc,
+             const Decl *Callee = nullptr)
+      : Effect(Effect), ID(ID), Loc(Loc), Callee(Callee) {}
+};
+
+enum class SpecialFuncType : uint8_t { None, OperatorNew, OperatorDelete };
+enum class CallType {
+  // unknown: probably function pointer
+  Unknown,
+  Function,
+  Virtual,
+  Block
+};
+
+// Return whether a function's effects CAN be verified.
+// The question of whether it SHOULD be verified is independent.
+static bool functionIsVerifiable(const FunctionDecl *FD) {
+  if (!(FD->hasBody() || FD->isInlined())) {
+    // externally defined; we couldn't verify if we wanted to.
+    return false;
+  }
+  if (FD->isTrivial()) {
+    // Otherwise `struct x { int a; };` would have an unverifiable default
+    // constructor.
+    return true;
+  }
+  return true;
+}
+
+/// A mutable set of FunctionEffect, for use in places where any conditions
+/// have been resolved or can be ignored.
+class EffectSet {
+  // This implementation optimizes footprint, since we hold one of these for
+  // every function visited, which, due to inference, can be many more functions
+  // than have declared effects.
+
+  template <typename T, typename SizeT, SizeT Capacity> struct FixedVector {
+    SizeT Count = 0;
+    T Items[Capacity] = {};
+
+    using value_type = T;
+
+    using iterator = T *;
+    using const_iterator = const T *;
+    iterator begin() { return &Items[0]; }
+    iterator end() { return &Items[Count]; }
+    const_iterator begin() const { return &Items[0]; }
+    const_iterator end() const { return &Items[Count]; }
+    const_iterator cbegin() const { return &Items[0]; }
+    const_iterator cend() const { return &Items[Count]; }
+
+    void insert(iterator I, const T &Value) {
+      assert(Count < Capacity);
+      iterator E = end();
+      if (I != E)
+        std::copy_backward(I, E, E + 1);
+      *I = Value;
+      ++Count;
+    }
+
+    void push_back(const T &Value) {
+      assert(Count < Capacity);
+      Items[Count++] = Value;
+    }
+  };
+
+  // As long as FunctionEffect is only 1 byte, and there are only 2 verifiable
+  // effects, this fixed-size vector with a capacity of 7 is more than
+  // sufficient and is only 8 bytes.
+  FixedVector<FunctionEffect, uint8_t, 7> Impl;
+
+public:
+  EffectSet() = default;
+  explicit EffectSet(FunctionEffectsRef FX) { insert(FX); }
+
+  operator ArrayRef<FunctionEffect>() const {
+    return ArrayRef(Impl.cbegin(), Impl.cend());
+  }
+
+  using iterator = const FunctionEffect *;
+  iterator begin() const { return Impl.cbegin(); }
+  iterator end() const { return Impl.cend(); }
+
+  void insert(const FunctionEffect &Effect) {
+    FunctionEffect *Iter = Impl.begin();
+    FunctionEffect *End = Impl.end();
+    // linear search; lower_bound is overkill for a tiny vector like this
+    for (; Iter != End; ++Iter) {
+      if (*Iter == Effect)
+        return;
+      if (Effect < *Iter)
+        break;
+    }
+    Impl.insert(Iter, Effect);
+  }
+  void insert(const EffectSet &Set) {
+    for (const FunctionEffect &Item : Set) {
+      // push_back because set is already sorted
+      Impl.push_back(Item);
+    }
+  }
+  void insert(FunctionEffectsRef FX) {
+    for (const FunctionEffectWithCondition &EC : FX) {
+      assert(EC.Cond.getCondition() ==
+             nullptr); // should be resolved by now, right?
+      // push_back because set is already sorted
+      Impl.push_back(EC.Effect);
+    }
+  }
+  bool contains(const FunctionEffect::Kind EK) const {
+    for (const FunctionEffect &E : Impl)
+      if (E.kind() == EK)
+        return true;
+    return false;
+  }
+
+  void dump(llvm::raw_ostream &OS) const;
+
+  static EffectSet difference(ArrayRef<FunctionEffect> LHS,
+                              ArrayRef<FunctionEffect> RHS) {
+    EffectSet Result;
+    std::set_difference(LHS.begin(), LHS.end(), RHS.begin(), RHS.end(),
+                        std::back_inserter(Result.Impl));
+    return Result;
+  }
+};
+
+LLVM_DUMP_METHOD void EffectSet::dump(llvm::raw_ostream &OS) const {
+  OS << "Effects{";
+  bool First = true;
+  for (const FunctionEffect &Effect : *this) {
+    if (!First)
+      OS << ", ";
+    else
+      First = false;
+    OS << Effect.name();
+  }
+  OS << "}";
+}
+
+// Transitory, more extended information about a callable, which can be a
+// function, block, function pointer, etc.
+struct CallableInfo {
+  // CDecl holds the function's definition, if any.
+  // FunctionDecl if CallType::Function or Virtual
+  // BlockDecl if CallType::Block
+  const Decl *CDecl;
+  mutable std::optional<std::string> MaybeName;
+  SpecialFuncType FuncType = SpecialFuncType::None;
+  EffectSet Effects;
+  CallType CType = CallType::Unknown;
+
+  CallableInfo(Sema &SemaRef, const Decl &CD,
+               SpecialFuncType FT = SpecialFuncType::None)
+      : CDecl(&CD), FuncType(FT) {
+    FunctionEffectsRef FXRef;
+
+    if (auto *FD = dyn_cast<FunctionDecl>(CDecl)) {
+      // Use the function's definition, if any.
+      if (const FunctionDecl *Def = FD->getDefinition())
+        CDecl = FD = Def;
+      CType = CallType::Function;
+      if (auto *Method = dyn_cast<CXXMethodDecl>(FD);
+          Method && Method->isVirtual())
+        CType = CallType::Virtual;
+      FXRef = FD->getFunctionEffects();
+    } else if (auto *BD = dyn_cast<BlockDecl>(CDecl)) {
+      CType = CallType::Block;
+      FXRef = BD->getFunctionEffects();
+    } else if (auto *VD = dyn_cast<ValueDecl>(CDecl)) {
+      // ValueDecl is function, enum, or variable, so just look at its type.
+      FXRef = FunctionEffectsRef::get(VD->getType());
+    }
+    Effects = EffectSet(FXRef);
+  }
+
+  bool isDirectCall() const {
+    return CType == CallType::Function || CType == CallType::Block;
+  }
+
+  bool isVerifiable() const {
+    switch (CType) {
+    case CallType::Unknown:
+    case CallType::Virtual:
+      break;
+    case CallType::Block:
+      return true;
+    case CallType::Function:
+      return functionIsVerifiable(dyn_cast<FunctionDecl>(CDecl));
+    }
+    return false;
+  }
+
+  /// Generate a name for logging and diagnostics.
+  std::string name(Sema &Sem) const {
+    if (!MaybeName) {
+      std::string Name;
+      llvm::raw_string_ostream OS(Name);
+
+      if (auto *FD = dyn_cast<FunctionDecl>(CDecl))
+        FD->getNameForDiagnostic(OS, Sem.getPrintingPolicy(),
+                                 /*Qualified=*/true);
+      else if (auto *BD = dyn_cast<BlockDecl>(CDecl))
+        OS << "(block " << BD->getBlockManglingNumber() << ")";
+      else if (auto *VD = dyn_cast<NamedDecl>(CDecl))
+        VD->printQualifiedName(OS);
+      MaybeName = Name;
+    }
+    return *MaybeName;
+  }
+};
+
+// ----------
+// Map effects to single diagnostics, to hold the first (of potentially many)
+// diagnostics pertaining to an effect, per function.
+class EffectToDiagnosticMap {
+  // Since we currently only have a tiny number of effects (typically no more
+  // than 1), use a sorted SmallVector with an inline capacity of 1. Since it
+  // is often empty, use a unique_ptr to the SmallVector.
+  // Note that Diagnostic itself contains a FunctionEffect which is the key.
+  using ImplVec = llvm::SmallVector<Diagnostic, 1>;
+  std::unique_ptr<ImplVec> Impl;
+
+public:
+  // Insert a new diagnostic if we do not already have one for its effect.
+  void maybeInsert(const Diagnostic &Diag) {
+    if (Impl == nullptr)
+      Impl = std::make_unique<ImplVec>();
+    auto *Iter = _find(Diag.Effect);
+    if (Iter != Impl->end() && Iter->Effect == Diag.Effect)
+      return;
+
+    Impl->insert(Iter, Diag);
+  }
+
+  const Diagnostic *lookup(FunctionEffect Key) {
+    if (Impl == nullptr)
+      return nullptr;
+
+    auto *Iter = _find(Key);
+    if (Iter != Impl->end() && Iter->Effect == Key)
+      return &*Iter;
+
+    return nullptr;
+  }
+
+  size_t size() const { return Impl ? Impl->size() : 0; }
+
+private:
+  ImplVec::iterator _find(const FunctionEffect &key) {
+    // A linear search suffices for a tiny number of possible effects.
+    auto *End = Impl->end();
+    for (auto *Iter = Impl->begin(); Iter != End; ++Iter)
+      if (!(Iter->Effect < key))
+        return Iter;
+    return End;
+  }
+};
+
+// ----------
+// State pertaining to a function whose AST is walked and whose effect analysis
+// is dependent on a subsequent analysis of other functions.
+class PendingFunctionAnalysis {
+  friend class CompleteFunctionAnalysis;
+
+public:
+  struct DirectCall {
+    const Decl *Callee;
+    SourceLocation CallLoc;
+    // Not all recursive calls are detected, just enough
+    // to break cycles.
+    bool Recursed = false;
+
+    DirectCall(const Decl *D, SourceLocation CallLoc)
+        : Callee(D), CallLoc(CallLoc) {}
+  };
+
+  // We always have two disjoint sets of effects to verify:
+  // 1. Effects declared explicitly by this function.
+  // 2. All other inferrable effects needing verification.
+  EffectSet DeclaredVerifiableEffects;
+  EffectSet FXToInfer;
+
+private:
+  // Diagnostics pertaining to the function's explicit effects.
+  SmallVector<Diagnostic, 0> DiagnosticsForExplicitFX;
+
+  // Diagnostics pertaining to other, non-explicit, inferrable effects.
+  EffectToDiagnosticMap InferrableEffectToFirstDiagnostic;
+
+  // These unverified direct calls are what keeps the analysis "pending",
+  // until the callees can be verified.
+  SmallVector<DirectCall, 0> UnverifiedDirectCalls;
+
+public:
+  PendingFunctionAnalysis(
+      Sema &Sem, const CallableInfo &CInfo,
+      ArrayRef<FunctionEffect> AllInferrableEffectsToVerify) {
+    DeclaredVerifiableEffects = CInfo.Effects;
+
+    // Check for effects we are not allowed to infer
+    EffectSet InferrableFX;
+
+    for (const FunctionEffect &effect : AllInferrableEffectsToVerify) {
+      if (effect.canInferOnFunction(*CInfo.CDecl))
+        InferrableFX.insert(effect);
+      else {
+        // Add a diagnostic for this effect if a caller were to
+        // try to infer it.
+        InferrableEffectToFirstDiagnostic.maybeInsert(
+            Diagnostic(effect, DiagnosticID::DeclDisallowsInference,
+                       CInfo.CDecl->getLocation()));
----------------
Sirraide wrote:

The way this is phrased is a bit confusing to me: From what I gather, this is about the fact that we can’t infer e.g. `nonblocking` on a function that is specifically marked as `blocking`, which, fair enough, but shouldn’t we, in that case, report that it’s specifically been marked as blocking instead of reporting the fact that we’re unable to infer whether it is nonblocking?

The diagnostic wording for this is also a bit weird. ‘function does not permit inference of X’ is not particularly helpful. It should probably include the effect, i.e. ‘function cannot be X because it is declared as Y’.

https://github.com/llvm/llvm-project/pull/99656