[clang] [llvm] [clang] Add/enhance documentation for some important classes. (PR #109795)

Sam McCall via cfe-commits cfe-commits at lists.llvm.org
Tue Sep 24 06:03:43 PDT 2024


https://github.com/sam-mccall created https://github.com/llvm/llvm-project/pull/109795

None

>From 8dd7d0afc65526f152a02cbd5772ba9882cc2614 Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall at gmail.com>
Date: Tue, 24 Sep 2024 15:02:36 +0200
Subject: [PATCH] [clang] Add/enhance documentation for some important classes.

---
 clang/include/clang/AST/DeclBase.h            | 52 +++++++------
 clang/include/clang/AST/Stmt.h                | 18 ++++-
 clang/include/clang/AST/Type.h                | 40 +++++-----
 clang/include/clang/AST/TypeLoc.h             | 78 ++++++++++++++++++-
 clang/include/clang/Basic/SourceManager.h     | 56 ++++++++++---
 .../llvm/Support/FileSystem/UniqueID.h        | 11 +--
 llvm/include/llvm/Support/VirtualFileSystem.h | 19 ++++-
 7 files changed, 209 insertions(+), 65 deletions(-)

diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index ee662ed73d7e0e..8b76cd43c1d62a 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -76,13 +76,26 @@ enum AvailabilityResult {
   AR_Unavailable
 };
 
-/// Decl - This represents one declaration (or definition), e.g. a variable,
-/// typedef, function, struct, etc.
+/// A Decl describes a declaration (or definition) of a variable, function, etc.
+/// This is the base class for a hierarchy of Decls: VarDecl, FunctionDecl...
 ///
-/// Note: There are objects tacked on before the *beginning* of Decl
-/// (and its subclasses) in its Decl::operator new(). Proper alignment
-/// of all subclasses (not requiring more than the alignment of Decl) is
-/// asserted in DeclBase.cpp.
+/// The declarations form a tree rooted at the TranslationUnitDecl.
+/// The non-leaf nodes of this tree are DeclContexts (as well as being Decls).
+///
+/// Decls are also the AST's representation of the things being declared.
+/// So a VarDecl* may refer either to a specific declaration of a variable, or
+/// to the variable itself - pointing at an arbitrary declaration of it.
+/// (Declarations of the same variable are linked in a "redecl chain", and
+/// the first entry is the canonical representative when one is needed).
+///
+/// Some entities have zero declarations in code, like implicit constructors.
+/// For these, a Decl is synthesized and marked "implicit". Lexical information
+/// like SourceLocations may not be meaningful for such Decls.
+///
+/// Like other AST nodes, Decls are allocated within an ASTContext, using
+/// factory functions like FooDecl::Create(Ctx, ...).
+/// This may allocate leading data before the object (see Decl::operator new)
+/// and trailing data after it (see e.g. CXXConstructorDecl::Create).
 class alignas(8) Decl {
 public:
   /// Lists the kind of concrete classes of Decl.
@@ -1416,23 +1429,18 @@ enum class OMPDeclareReductionInitKind;
 enum class ObjCImplementationControl;
 enum class LinkageSpecLanguageIDs;
 
-/// DeclContext - This is used only as base class of specific decl types that
-/// can act as declaration contexts. These decls are (only the top classes
-/// that directly derive from DeclContext are mentioned, not their subclasses):
+/// A declaration context is a Decl that can contain other declarations.
+///
+/// Declarations form a tree rooted at the single TranslationUnitDecl, and
+/// the non-leaf nodes are DeclContexts.
+///
+/// Contexts are important for name lookup, which usually involves querying
+/// several contexts in sequence. (However some local lookup scopes such as
+/// CompoundStmt are not DeclContexts - see clang::Scope).
 ///
-///   TranslationUnitDecl
-///   ExternCContext
-///   NamespaceDecl
-///   TagDecl
-///   OMPDeclareReductionDecl
-///   OMPDeclareMapperDecl
-///   FunctionDecl
-///   ObjCMethodDecl
-///   ObjCContainerDecl
-///   LinkageSpecDecl
-///   ExportDecl
-///   BlockDecl
-///   CapturedDecl
+/// DeclContext is a second base class for the relevant Decl subclasses,
+/// e.g. FunctionDecl inherits from both DeclaratorDecl and DeclContext.
+/// It is safe to cast<Decl>(&DC), as DeclContexts are always Decls.
 class DeclContext {
   /// For makeDeclVisibleInContextImpl
   friend class ASTDeclReader;
diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h
index 7aed83e9c68bb7..07133c245f0027 100644
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@@ -79,8 +79,24 @@ enum class StringLiteralKind;
 // AST classes for statements.
 //===----------------------------------------------------------------------===//
 
-/// Stmt - This represents one statement.
+/// A statement or expression in the program.
 ///
+/// This is the base for the hierarchy of statements (ForStmt, ReturnStmt...)
+/// as well as expressions (Expr, CastExpr, IntegerLiteral...).
+/// Classing expressions as Stmt allows them to appear as statements without
+/// needing an extra "expression-statement" node.
+///
+/// Statements can have children and so form trees. e.g. `while (i>0) i--;`
+///
+///   WhileStmt
+///   |-BinaryOperator >
+///   | |-DeclRefExpr i
+///   | `-IntegerLiteral 0
+///   `-UnaryOperator --
+///     `-DeclRefExpr i
+///
+/// These trees are usually rooted at function bodies, and attach to the rest
+/// of the AST via FunctionDecls.
 class alignas(void *) Stmt {
 public:
   enum StmtClass {
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index dc87b84153e74a..4af66bf571abf6 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -1802,32 +1802,32 @@ enum class ArraySizeModifier;
 enum class ElaboratedTypeKeyword;
 enum class VectorKind;
 
-/// The base class of the type hierarchy.
+/// A type in the program, such as `int` or `vector<bool>`.
+/// This is the base class for a hierarchy of types: PointerType, BuiltinType etc.
 ///
-/// A central concept with types is that each type always has a canonical
-/// type.  A canonical type is the type with any typedef names stripped out
-/// of it or the types it references.  For example, consider:
+/// Types appear throughout the AST: expressions and (some) declarations have
+/// types; casts, new-expressions, and template-arguments refer to types, etc.
+/// A Type is not tied to a specific place where it was written (see TypeLoc).
 ///
-///  typedef int  foo;
-///  typedef foo* bar;
-///    'int *'    'foo *'    'bar'
+/// For each distinct type (per language rules) there is one canonical Type.
+/// Compound types are formed as trees of simpler types.
+/// e.g. `int*`: a PointerType(pointee = BuiltinType(kind = Int)).
+/// Types are interned: you can compare canonical type equality by pointer.
 ///
-/// There will be a Type object created for 'int'.  Since int is canonical, its
-/// CanonicalType pointer points to itself.  There is also a Type for 'foo' (a
-/// TypedefType).  Its CanonicalType pointer points to the 'int' Type.  Next
-/// there is a PointerType that represents 'int*', which, like 'int', is
-/// canonical.  Finally, there is a PointerType type for 'foo*' whose canonical
-/// type is 'int*', and there is a TypedefType for 'bar', whose canonical type
-/// is also 'int*'.
+/// There are also non-canonical "sugar" types, which also describe e.g. what
+/// typedef was used. For example:
 ///
-/// Non-canonical types are useful for emitting diagnostics, without losing
-/// information about typedefs being used.  Canonical types are useful for type
-/// comparisons (they allow by-pointer equality tests) and useful for reasoning
-/// about whether something has a particular form (e.g. is a function type),
-/// because they implicitly, recursively, strip all typedefs out of a type.
+///   using Integer = int; // BuiltinType(kind = Int)
+///   Integer* x;          // PointerType(pointee = TypedefType(decl = Integer))
 ///
-/// Types, once created, are immutable.
+/// The Type obtained from the VarDecl for x reflects how that declaration was
+/// written. This is useful for diagnostic messages and so on.
+/// Each Type has a pointer to its canonical type, and these should be used for
+/// semantic checks: are two types equal, is this a function type, etc.
 ///
+/// All types are allocated within the ASTContext, interned, and immutable.
+/// For compactness, qualifiers (`const int`) do not create distinct Types.
+/// See QualType - essentially a Type* with cv-qualifiers stored in low bits. 
 class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
 public:
   enum TypeClass {
diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h
index 62ca52e508ba20..67e636aa4030c6 100644
--- a/clang/include/clang/AST/TypeLoc.h
+++ b/clang/include/clang/AST/TypeLoc.h
@@ -7,7 +7,8 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// Defines the clang::TypeLoc interface and its subclasses.
+/// Defines the clang::TypeLoc interface and its subclasses, which model
+/// both syntax and semantics of types written in source code.
 //
 //===----------------------------------------------------------------------===//
 
@@ -52,10 +53,79 @@ class UnresolvedUsingTypenameDecl;
   class Class##TypeLoc;
 #include "clang/AST/TypeLocNodes.def"
 
-/// Base wrapper for a particular "section" of type source info.
+/// A TypeLoc describes an occurrence of a type written in source code.
+/// This is the base class for a hierarchy: PointerTypeLoc, BuiltinTypeLoc etc,
+/// which is parallel to the hierarchy of Types.
 ///
-/// A client should use the TypeLoc subclasses through castAs()/getAs()
-/// in order to get at the actual information.
+/// The following code has three TypeLocs:
+///   int x;
+///   int* y;
+/// - a BuiltinTypeLoc for `int` on line 1
+/// - a PointerTypeLoc for `int*` on line 2
+/// - a BuiltinTypeLoc for `int` on line 2. (== PointerTypeLoc.getPointeeLoc())
+///
+/// TypeLocs describe both the type and how it was written. A PointerTypeLoc
+/// contains the SourceLocation of the `*`, and a TypeLoc describing how the
+/// pointee type was written. It also contains the PointerType which fully
+/// describes the type's semantics.
+///
+/// In general TypeLocs and Types are not 1:1 - expressions have Types but
+/// not TypeLocs, and multiple TypeLocs can name the same Type (`int`, above).
+///
+/// TypeLocs are passed by value, and are most easily understood as non-owning
+/// reference types (like llvm::StringRef).
+///
+/// ====== Data model and layout =====
+///
+/// A TypeLoc is a fat pointer (Type*, void* data).
+///
+/// - The Type is sugared, so captures written structure (use of typedefs etc).
+/// - The data buffer stores location information to augment this Type, followed
+///   by location information to augment any Types nested within it.
+///
+/// TypeLoc does not own its buffer - TypeSourceInfo is the owning equivalent.
+/// AST nodes have pointers to allocated TypeSourceInfo objects, and their
+/// public methods expose TypeLoc "views" of these.
+///
+/// ----------------------------------
+///
+/// This data model is best motivated by an example:
+/// Naively, a PointerTypeLoc for `int* x` could look like:
+///
+///   PointerTypeLoc(
+///      type = PointerType(pointee = BuiltinType(kind = Int)),
+///      starLoc = <loc1>,
+///      pointeeLoc = BuiltinTypeLoc(
+///        type = BuiltinType(kind = Int),
+///        keywordLoc = <loc2>
+///      )
+///  )
+///
+/// There is a lot of redundancy here: the PointerTypeLoc structure mirrors
+/// the corresponding PointerType, and we're just adding some SourceLocations.
+/// 
+/// Instead, this more compact representation is used:
+///
+///   PointerTypeLoc(
+///     data = [<loc1><loc2>],  // not owned!
+///     type = PointerType(pointee = BuiltinType(kind = Int)),
+///   )
+///
+/// The front of the data buffer has the starLoc "local" to PointerTypeLoc.
+/// The back of the data buffer has the data for the inner types.
+///
+/// PointerTypeLoc's implementation is:
+///   getStarLoc() returns data[0].
+///   getPointeeLoc() returns TypeLoc(type->getPointeeType(), &data[1])
+///
+/// ----------------------------------
+///
+/// One quirk here is how the TypeLoc inheritance hierarchy and casting work.
+///
+/// All subclasses of TypeLoc have exactly the same layout: (Type*, void*).
+/// We downcast a TypeLoc to e.g. PointerTypeLoc *by value*, by simply creating
+/// a PointerTypeLoc with the same type and buffer.
+/// (The cast can be checked by examining the Type).
 class TypeLoc {
 protected:
   // The correctness of this relies on the property that, for Type *Ty,
diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
index e0f1ea435d54e4..a1c7733da4aee3 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -648,18 +648,54 @@ class InBeforeInTUCacheEntry {
 /// instances.
 using ModuleBuildStack = ArrayRef<std::pair<std::string, FullSourceLoc>>;
 
-/// This class handles loading and caching of source files into memory.
+/// The SourceManager describes the compiler's view of source code.
 ///
-/// This object owns the MemoryBuffer objects for all of the loaded
-/// files and assigns unique FileID's for each unique \#include chain.
+/// This includes:
+///   - sources before preprocessing: raw code from disk
+///   - code after preprocessing e.g. expanded from an `assert()` macro
+///   - the relationship between the two, e.g. where the `assert()` was written
 ///
-/// The SourceManager can be queried for information about SourceLocation
-/// objects, turning them into either spelling or expansion locations. Spelling
-/// locations represent where the bytes corresponding to a token came from and
-/// expansion locations represent where the location is in the user's view. In
-/// the case of a macro expansion, for example, the spelling location indicates
-/// where the expanded token came from and the expansion location specifies
-/// where it was expanded.
+/// SourceManager is designed to represent this information compactly. 
+/// AST nodes hold SourceLocations pointing at tokens that were parsed,
+/// so that diagnostics can point to relevant source code (including macros).
+/// A SourceLocation is a 32-bit integer, and is only meaningful together with
+/// the SourceManager which maintains the tables needed to decode it.
+///
+/// The SourceManager sits above the FileManager, which reads source files and
+/// exposes them as FileEntrys.
+/// SourceManager does not generally know about tokens or AST nodes; the lexer
+/// and parser are layered above SourceManager.
+/// 
+/// ====== SourceLocations, FileIDs, and SLocEntrys =======
+///
+/// A SourceLocation can point at any byte of code. Rather than store
+/// (file, offset) pairs, it is a single offset into a buffer of all files
+/// concatenated together.
+///
+/// [--file1--][----file2----][file3]
+///               ^
+/// This buffer does not exist in memory. Instead SourceManager keeps an array
+/// of SLocEntry objects, each describing one file and its offset in the buffer.
+/// Each entry is assigned a FileID, and SourceManager can encode/decode a
+/// SourceLocation into a (FileID, file offset) pair: see getDecomposedLoc().
+///
+/// The SLocEntry holds the cached source code, and preprocessing metadata:
+/// the SourceLocation where the file was #included.
+///
+/// SourceLocations can also point at code produced by macro expansion.
+/// To achieve this, every macro expansion has its own SLocEntry and FileID.
+///
+/// [--file1--][----file2----][expn of INT_MAX][file3][another expn of INT_MAX]
+///                                ^
+/// Again, the expanded code does not exist in memory, only as address space.
+/// In this case, the SLocEntry stores two SourceLocations:
+///  - expansion location: where the macro was used (i.e. the text "INT_MAX")
+///  - spelling location: the macro's definition (i.e. the text "2147483647")
+/// Either or both may be relevant to the user.
+/// See get[Immediate]ExpansionRange and get[Immediate]SpellingLoc.
+///
+/// Nested macros are also handled by this scheme: e.g. the expansion location
+/// may itself be inside a macro expansion.
 class SourceManager : public RefCountedBase<SourceManager> {
   /// DiagnosticsEngine object.
   DiagnosticsEngine &Diag;
diff --git a/llvm/include/llvm/Support/FileSystem/UniqueID.h b/llvm/include/llvm/Support/FileSystem/UniqueID.h
index 0d5367236e8dcf..0b16ca0c83a8e9 100644
--- a/llvm/include/llvm/Support/FileSystem/UniqueID.h
+++ b/llvm/include/llvm/Support/FileSystem/UniqueID.h
@@ -5,11 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// This file is cut out of llvm/Support/FileSystem.h to allow UniqueID to be
-// reused without bloating the includes.
-//
-//===----------------------------------------------------------------------===//
 
 #ifndef LLVM_SUPPORT_FILESYSTEM_UNIQUEID_H
 #define LLVM_SUPPORT_FILESYSTEM_UNIQUEID_H
@@ -23,6 +18,12 @@ namespace llvm {
 namespace sys {
 namespace fs {
 
+/// fs::UniqueID is the identity of a file, independent of its path.
+///
+/// Files with multiple paths (hard links) will have the same UniqueID.
+/// Symlinks have a UniqueID distinct from the file they point to.
+///
+/// UniqueID corresponds to the (device, inode number) pair on POSIX platforms.
 class UniqueID {
   uint64_t Device;
   uint64_t File;
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h
index 2531c075f262d7..131f9d2686998c 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -7,7 +7,10 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// Defines the virtual file system interface vfs::FileSystem.
+/// Provides abstract filesystem APIs to decouple from OS-level file access.
+///
+/// This allows replacing access to physical files on disk by in-memory data,
+/// for testing or other purposes.
 //
 //===----------------------------------------------------------------------===//
 
@@ -44,7 +47,10 @@ class Twine;
 
 namespace vfs {
 
-/// The result of a \p status operation.
+/// Metadata about a file or directory, obtained through a virtual filesystem.
+///
+/// Obtained from a path (FileSystem::status) or an open File.
+/// This loosely corresponds to a POSIX `struct stat`, but also knows its path.
 class Status {
   std::string Name;
   llvm::sys::fs::UniqueID UID;
@@ -261,7 +267,14 @@ class recursive_directory_iterator {
   void no_push() { State->HasNoPushRequest = true; }
 };
 
-/// The virtual file system interface.
+/// A vfs::FileSystem abstracts read-only filesystem access.
+///
+/// The standard implementation (getRealFileSystem()) forwards to the operating
+/// system's filesystem APIs. Others can expose in-memory virtual files, or
+/// wrap an underlying filesystem to change its behavior.
+///
+/// "VFS-clean" code avoids direct OS IO APIs, and can be deployed in more
+/// environments, such as crash reproducers with llvm::FileCollector.
 class FileSystem : public llvm::ThreadSafeRefCountedBase<FileSystem>,
                    public RTTIExtends<FileSystem, RTTIRoot> {
 public:



More information about the cfe-commits mailing list