[clang] [clang] Implement __attribute__((format_matches)) (PR #116708)

via cfe-commits cfe-commits at lists.llvm.org
Mon Nov 18 14:53:45 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang

Author: None (apple-fcloutier)

<details>
<summary>Changes</summary>

This implements ``__attribute__((format_matches))``, as described in the RFC: https://discourse.llvm.org/t/rfc-format-attribute-attribute-format-like/83076

The ``format`` attribute only allows the compiler to check that a format string matches its arguments. If the format string is passed independently of its arguments, there is no way to have the compiler check it. ``format_matches(flavor, fmtidx, example)`` allows the compiler to check format strings against the ``example`` format string instead of against format arguments. See the changes to AttrDocs.td in this diff for more information.

Implementation-wise, this change subclasses CheckPrintfHandler and CheckScanfHandler to allow them to collect specifiers into arrays, and implements comparing that two specifiers are equivalent. `checkFormatStringExpr` gets a new `ReferenceFormatString` argument that is piped down when calling a function with the `format_matches` attribute (and is `nullptr` otherwise); this is the string that the actual format string is compared against.

Although this change does not enable -Wformat-nonliteral by default, IMO, all the pieces are now in place such that it could be.

---

Patch is 80.51 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/116708.diff


12 Files Affected:

- (modified) clang/docs/ReleaseNotes.rst (+44) 
- (modified) clang/include/clang/AST/FormatString.h (+2-1) 
- (modified) clang/include/clang/Basic/Attr.td (+10) 
- (modified) clang/include/clang/Basic/AttrDocs.td (+97) 
- (modified) clang/include/clang/Basic/DiagnosticSemaKinds.td (+21) 
- (modified) clang/include/clang/Sema/Sema.h (+27-10) 
- (modified) clang/lib/AST/AttrImpl.cpp (+5) 
- (modified) clang/lib/AST/FormatString.cpp (+91) 
- (modified) clang/lib/Sema/SemaChecking.cpp (+607-147) 
- (modified) clang/lib/Sema/SemaDeclAttr.cpp (+93-27) 
- (modified) clang/lib/Sema/SemaObjC.cpp (+2-1) 
- (added) clang/test/Sema/format-string-matches.c (+122) 


``````````diff
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 0efe62f1804cd0a..66e4758fe89e243 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -357,6 +357,50 @@ Non-comprehensive list of changes in this release
 
 - ``__builtin_reduce_add`` function can now be used in constant expressions.
 
+- There is a new ``format_matches`` attribute to complement the existing
+  ``format`` attribute. ``format_matches`` allows the compiler to verify that
+  a format string argument is equivalent to a reference format string: it is
+  useful when a function accepts a format string without its accompanying
+  arguments to format. For instance:
+
+  .. code-block:: c
+
+    static int status_code;
+    static const char *status_string;
+
+    void print_status(const char *fmt) {
+      fprintf(stderr, fmt, status_code, status_string);
+      // ^ warning: format string is not a string literal [-Wformat-nonliteral]
+    }
+
+    void stuff(void) {
+      print_status("%s (%#08x)\n");
+      // order of %s and %x is swapped but there is no diagnostic
+    }
+  
+  Before the introducion of ``format_matches``, this code cannot be verified
+  at compile-time. ``format_matches`` plugs that hole:
+
+  .. code-block:: c
+
+    __attribute__((format_matches(printf, 1, "%x %s")))
+    void print_status(const char *fmt) {
+      fprintf(stderr, fmt, status_code, status_string);
+      // ^ `fmt` verified as if it was "%x %s" here; no longer triggers
+      //   -Wformat-nonliteral, would warn if arguments did not match "%x %s"
+    }
+
+    void stuff(void) {
+      print_status("%s (%#08x)\n");
+      // warning: format specifier 's' is incompatible with 'x'
+      // warning: format specifier 'x' is incompatible with 's'
+    }
+
+  Like with ``format``, the first argument is the format string flavor and the
+  second argument is the index of the format string parameter.
+  ``format_matches`` accepts an example valid format string as its third
+  argument. For more information, see the Clang attributes documentation.
+
 New Compiler Flags
 ------------------
 
diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h
index a074dd23e2ad4c7..3560766433fe220 100644
--- a/clang/include/clang/AST/FormatString.h
+++ b/clang/include/clang/AST/FormatString.h
@@ -292,7 +292,7 @@ class ArgType {
   };
 
 private:
-  const Kind K;
+  Kind K;
   QualType T;
   const char *Name = nullptr;
   bool Ptr = false;
@@ -338,6 +338,7 @@ class ArgType {
   }
 
   MatchKind matchesType(ASTContext &C, QualType argTy) const;
+  MatchKind matchesArgType(ASTContext &C, const ArgType &other) const;
 
   QualType getRepresentativeType(ASTContext &C) const;
 
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 6035a563d5fce77..7723e631aa2529c 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1807,6 +1807,16 @@ def Format : InheritableAttr {
   let Documentation = [FormatDocs];
 }
 
+def FormatMatches : InheritableAttr {
+  let Spellings = [GCC<"format_matches">];
+  let Args = [IdentifierArgument<"Type">, IntArgument<"FormatIdx">, ExprArgument<"ExpectedFormat">];
+  let AdditionalMembers = [{
+    StringLiteral *getFormatString() const;
+  }];
+  let Subjects = SubjectList<[ObjCMethod, Block, HasFunctionProto]>;
+  let Documentation = [FormatMatchesDocs];
+}
+
 def FormatArg : InheritableAttr {
   let Spellings = [GCC<"format_arg">];
   let Args = [ParamIdxArgument<"FormatIdx">];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 2fdceca163ee63b..2ca1bf1a454271a 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3738,6 +3738,103 @@ behavior of the program is undefined.
   }];
 }
 
+def FormatMatchesDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+
+The ``format`` attribute is the basis for the enforcement of diagnostics in the
+``-Wformat`` family, but it only handles the case where the format string is
+passed along with the arguments it is going to format. It cannot handle the case
+where the format string and the format arguments are passed separately from each
+other. For instance:
+
+.. code-block:: c
+
+  static const char *first_name;
+  static double todays_temperature;
+  static int wind_speed;
+
+  void say_hi(const char *fmt) {
+    printf(fmt, first_name, todays_temperature); // warning: format string is not a string literal
+    printf(fmt, first_name, wind_speed); // warning: format string is not a string literal
+  }
+
+  int main() {
+    say_hi("hello %s, it is %g degrees outside");
+    say_hi("hello %s, it is %d degrees outside!"); // no diagnostic
+  }
+
+In this example, ``fmt`` is expected to format a ``const char *`` and a
+``double``, but these values are not passed to ``say_hi``. Without the
+``format`` attribute (which cannot apply in this case), the -Wformat-nonliteral
+diagnostic triggers in the body of ``say_hi``, and incorrect ``say_hi`` call
+sites do not trigger a diagnostic.
+
+To complement the ``format`` attribute, Clang also defines the
+``format_matches`` attribute. Its syntax is similar to the ``format``
+attribute's, but instead of taking the index of the first formatted value
+argument, it takes a C string literal with the expected specifiers:
+
+.. code-block:: c
+
+  static const char *first_name;
+  static double todays_temperature;
+  static int wind_speed;
+
+  __attribute__((__format_matches__(printf, 1, "%s %g")))
+  void say_hi(const char *fmt) {
+    printf(fmt, first_name, todays_temperature); // no dignostic
+    printf(fmt, first_name, wind_speed); // warning: format specifies type 'int' but the argument has type 'double'
+  }
+
+  int main() {
+    say_hi("hello %s, it is %g degrees outside");
+    say_hi("it is %g degrees outside, have a good day %s!");
+    // warning: format specifies 'double' where 'const char *' is required
+    // warning: format specifies 'const char *' where 'double' is required
+  }
+
+The third argument to ``format_matches`` is expected to evaluate to a **C string
+literal** even when the format string would normally be a different type for the
+given flavor, like a ``CFStringRef`` or a ``NSString *``.
+
+In the implementation of a function with the ``format_matches`` attribute,
+format verification works as if the format string was identical to the one
+specified in the attribute.
+
+At the call sites of functions with the ``format_matches`` attribute, format
+verification instead compares the two format strings to evaluate their
+equivalence. Each format flavor defines equivalence between format specifiers.
+Generally speaking, two specifiers are equivalent if they format the same type.
+For instance, in the ``printf`` flavor, ``%2i`` and ``%-0.5d`` are compatible.
+When ``-Wformat-signedness`` is disabled, ``%d`` and ``%u`` are compatible. For
+a negative example, ``%ld`` is incompatible with ``%d``.
+
+Do note the following un-obvious cases:
+
+* Passing ``NULL`` as the format string is accepted without diagnostics.
+* When the format string is not NULL, it cannot _miss_ specifiers, even in
+  trailing positions. For instance, ``%d`` is not accepted when the required
+  format is ``%d %d %d``.
+* Specifiers for integers as small or smaller than ``int`` (such as ``%hhd``)
+  are all mutually compatible because standard argument promotion ensures that
+  integers are at least the size of ``int`` when passed as variadic arguments.
+  With ``-Wformat-signedness``, mixing specifier for types with a different
+  signedness still results in a diagnostic.
+* Format strings expecting a variable modifier (such as ``%*s``) are
+  incompatible with format strings that would itemize the variable modifiers
+  (such as ``%i %s``), even if the two specify ABI-compatible argument lists.
+* All pointer specifiers, modifiers aside, are mutually incompatible. For
+  instance, ``%s`` is not compatible with ``%p``, and ``%p`` is not compatible
+  with ``%n``, and ``%hhn`` is incompatible with ``%s``, even if the pointers
+  are ABI-compatible or identical on the selected platform. However, ``%0.5s``
+  is compatible with ``%s``, since the difference only exists in modifier flags.
+  This is not overridable with ``-Wformat-pedantic`` or its inverse, which
+  control similar behavior in ``-Wformat``.
+
+  }];
+}
+
 def FlagEnumDocs : Documentation {
   let Category = DocCatDecl;
   let Content = [{
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3caf471d3037f9e..0e7ffc95f8b53ef 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7687,6 +7687,7 @@ def warn_format_nonliteral_noargs : Warning<
 def warn_format_nonliteral : Warning<
   "format string is not a string literal">,
   InGroup<FormatNonLiteral>, DefaultIgnore;
+def err_format_nonliteral : Error<"format string is not a string literal">;
 
 def err_unexpected_interface : Error<
   "unexpected interface name %0: expected expression">;
@@ -9998,6 +9999,8 @@ def note_previous_declaration_as : Note<
 
 def warn_printf_insufficient_data_args : Warning<
   "more '%%' conversions than data arguments">, InGroup<FormatInsufficientArgs>;
+def warn_format_cmp_insufficient_specifiers : Warning<
+  "fewer specifiers in format string than expected">, InGroup<FormatInsufficientArgs>;
 def warn_printf_data_arg_not_used : Warning<
   "data argument not used by format string">, InGroup<FormatExtraArgs>;
 def warn_format_invalid_conversion : Warning<
@@ -10115,6 +10118,24 @@ def note_format_fix_specifier : Note<"did you mean to use '%0'?">;
 def note_printf_c_str: Note<"did you mean to call the %0 method?">;
 def note_format_security_fixit: Note<
   "treat the string as an argument to avoid this">;
+def warn_format_cmp_role_mismatch : Warning<
+  "format argument is %select{a value|an indirect field width|an indirect "
+  "precision|an auxiliary value}0, but it should be %select{a value|an indirect "
+  "field width|an indirect precision|an auxiliary value}1">, InGroup<Format>;
+def warn_format_cmp_modifierfor_mismatch : Warning<
+  "format argument modifies specifier at position %0, but it should modify "
+  "specifier at position %1">, InGroup<Format>;
+def warn_format_cmp_sensitivity_mismatch : Warning<
+  "argument sensitivity is %select{unspecified|private|public|sensitive}0, but "
+  "it should be %select{unspecified|private|public|sensitive}1">, InGroup<Format>;
+def warn_format_cmp_specifier_mismatch : Warning<
+  "format specifier '%0' is incompatible with '%1'">, InGroup<Format>;
+def warn_format_cmp_specifier_sign_mismatch : Warning<
+  "signedness of format specifier '%0' is incompatible with '%1'">, InGroup<Format>;
+def warn_format_cmp_specifier_mismatch_pedantic : Extension<
+  warn_format_cmp_specifier_sign_mismatch.Summary>, InGroup<FormatPedantic>;
+def note_format_cmp_with : Note<
+  "comparing with this %select{specifier|format string}0">;
 
 def warn_null_arg : Warning<
   "null passed to a callee that requires a non-null argument">,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d6f3508a5243f36..22df0bc2923db4e 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2150,6 +2150,7 @@ class Sema final : public SemaBase {
     FAPK_Fixed,    // values to format are fixed (no C-style variadic arguments)
     FAPK_Variadic, // values to format are passed as variadic arguments
     FAPK_VAList,   // values to format are passed in a va_list
+    FAPK_Elsewhere, // values to format are not passed to this function
   };
 
   // Used to grab the relevant information from a FormatAttr and a
@@ -2160,12 +2161,15 @@ class Sema final : public SemaBase {
     FormatArgumentPassingKind ArgPassingKind;
   };
 
-  /// Given a FunctionDecl's FormatAttr, attempts to populate the
-  /// FomatStringInfo parameter with the FormatAttr's correct format_idx and
-  /// firstDataArg. Returns true when the format fits the function and the
-  /// FormatStringInfo has been populated.
-  static bool getFormatStringInfo(const FormatAttr *Format, bool IsCXXMember,
-                                  bool IsVariadic, FormatStringInfo *FSI);
+  /// Given a function and its FormatAttr or FormatMatchesAttr info, attempts to
+  /// populate the FomatStringInfo parameter with the attribute's correct
+  /// format_idx and firstDataArg. Returns true when the format fits the
+  /// function and the FormatStringInfo has been populated.
+  static bool getFormatStringInfo(const Decl *Function, unsigned FormatIdx,
+                                  unsigned FirstArg, FormatStringInfo *FSI);
+  static bool getFormatStringInfo(unsigned FormatIdx, unsigned FirstArg,
+                                  bool IsCXXMember, bool IsVariadic,
+                                  FormatStringInfo *FSI);
 
   // Used by C++ template instantiation.
   ExprResult BuiltinShuffleVector(CallExpr *TheCall);
@@ -2188,7 +2192,9 @@ class Sema final : public SemaBase {
     FST_Syslog,
     FST_Unknown
   };
+  static FormatStringType GetFormatStringType(StringRef FormatFlavor);
   static FormatStringType GetFormatStringType(const FormatAttr *Format);
+  static FormatStringType GetFormatStringType(const FormatMatchesAttr *Format);
 
   bool FormatStringHasSArg(const StringLiteral *FExpr);
 
@@ -2535,11 +2541,17 @@ class Sema final : public SemaBase {
                             VariadicCallType CallType, SourceLocation Loc,
                             SourceRange Range,
                             llvm::SmallBitVector &CheckedVarArgs);
+  bool CheckFormatString(const FormatMatchesAttr *Format,
+                         ArrayRef<const Expr *> Args, bool IsCXXMember,
+                         VariadicCallType CallType, SourceLocation Loc,
+                         SourceRange Range,
+                         llvm::SmallBitVector &CheckedVarArgs);
   bool CheckFormatArguments(ArrayRef<const Expr *> Args,
-                            FormatArgumentPassingKind FAPK, unsigned format_idx,
-                            unsigned firstDataArg, FormatStringType Type,
-                            VariadicCallType CallType, SourceLocation Loc,
-                            SourceRange range,
+                            FormatArgumentPassingKind FAPK,
+                            const StringLiteral *ReferenceFormatString,
+                            unsigned format_idx, unsigned firstDataArg,
+                            FormatStringType Type, VariadicCallType CallType,
+                            SourceLocation Loc, SourceRange range,
                             llvm::SmallBitVector &CheckedVarArgs);
 
   void CheckInfNaNFunction(const CallExpr *Call, const FunctionDecl *FDecl);
@@ -4527,6 +4539,11 @@ class Sema final : public SemaBase {
   FormatAttr *mergeFormatAttr(Decl *D, const AttributeCommonInfo &CI,
                               IdentifierInfo *Format, int FormatIdx,
                               int FirstArg);
+  FormatMatchesAttr *mergeFormatMatchesAttr(Decl *D,
+                                            const AttributeCommonInfo &CI,
+                                            IdentifierInfo *Format,
+                                            int FormatIdx,
+                                            StringLiteral *FormatStr);
 
   /// AddAlignedAttr - Adds an aligned attribute to a particular declaration.
   void AddAlignedAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E,
diff --git a/clang/lib/AST/AttrImpl.cpp b/clang/lib/AST/AttrImpl.cpp
index f198a9acf8481fd..5a169f815637f6f 100644
--- a/clang/lib/AST/AttrImpl.cpp
+++ b/clang/lib/AST/AttrImpl.cpp
@@ -270,4 +270,9 @@ unsigned AlignedAttr::getAlignment(ASTContext &Ctx) const {
   return Ctx.getTargetDefaultAlignForAttributeAligned();
 }
 
+StringLiteral *FormatMatchesAttr::getFormatString() const {
+  // This cannot go in headers because StringLiteral and Expr may be incomplete.
+  return cast<StringLiteral>(getExpectedFormat());
+}
+
 #include "clang/AST/AttrImpl.inc"
diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp
index e892c1592df9869..5d3b56fc4e71377 100644
--- a/clang/lib/AST/FormatString.cpp
+++ b/clang/lib/AST/FormatString.cpp
@@ -595,6 +595,97 @@ ArgType::matchesType(ASTContext &C, QualType argTy) const {
   llvm_unreachable("Invalid ArgType Kind!");
 }
 
+static analyze_format_string::ArgType::MatchKind
+integerTypeMatch(ASTContext &C, QualType A, QualType B, bool CheckSign) {
+  using MK = analyze_format_string::ArgType::MatchKind;
+
+  uint64_t IntSize = C.getTypeSize(C.IntTy);
+  uint64_t ASize = C.getTypeSize(A);
+  uint64_t BSize = C.getTypeSize(B);
+  if (std::max(ASize, IntSize) != std::max(BSize, IntSize))
+    return MK::NoMatch;
+  if (CheckSign && A->isSignedIntegerType() != B->isSignedIntegerType())
+    return MK::NoMatchSignedness;
+  if (ASize != BSize)
+    return MK::MatchPromotion;
+  return MK::Match;
+}
+
+analyze_format_string::ArgType::MatchKind
+ArgType::matchesArgType(ASTContext &C, const ArgType &Other) const {
+  using AK = analyze_format_string::ArgType::Kind;
+
+  // Per matchesType.
+  if (K == AK::InvalidTy || Other.K == AK::InvalidTy)
+    return NoMatch;
+  if (K == AK::UnknownTy || Other.K == AK::UnknownTy)
+    return Match;
+
+  // Handle whether either (or both, or neither) sides has Ptr set,
+  // in addition to whether either (or both, or neither) sides is a SpecificTy
+  // that is a pointer.
+  ArgType Left = *this;
+  bool LeftWasPointer = false;
+  ArgType Right = Other;
+  bool RightWasPointer = false;
+  if (Left.Ptr) {
+    Left.Ptr = false;
+    LeftWasPointer = true;
+  } else if (Left.K == AK::SpecificTy && Left.T->isPointerType()) {
+    Left.T = Left.T->getPointeeType();
+    LeftWasPointer = true;
+  }
+  if (Right.Ptr) {
+    Right.Ptr = false;
+    RightWasPointer = true;
+  } else if (Right.K == AK::SpecificTy && Right.T->isPointerType()) {
+    Right.T = Right.T->getPointeeType();
+    RightWasPointer = true;
+  }
+
+  if (LeftWasPointer != RightWasPointer)
+    return NoMatch;
+
+  // Ensure that if at least one side is a SpecificTy, then Left is a
+  // SpecificTy.
+  if (Right.K == AK::SpecificTy)
+    std::swap(Left, Right);
+
+  if (Left.K == AK::SpecificTy) {
+    if (Right.K == AK::SpecificTy) {
+      auto Canon1 = C.getCanonicalType(Left.T);
+      auto Canon2 = C.getCanonicalType(Right.T);
+      if (Canon1 == Canon2)
+        return Match;
+
+      auto *BT1 = QualType(Canon1)->getAs<BuiltinType>();
+      auto *BT2 = QualType(Canon2)->getAs<BuiltinType>();
+      if (BT1 == nullptr || BT2 == nullptr)
+        return NoMatch;
+      if (BT1 == BT2)
+        return Match;
+
+      if (!LeftWasPointer && BT1->isInteger() && BT2->isInteger())
+        return integerTypeMatch(C, Canon1, Canon2, true);
+      return NoMatch;
+    } else if (Right.K == AK::AnyCharTy) {
+      if (!LeftWasPointer && Left.T->isIntegerType())
+        return integerTypeMatch(C, Left.T, C.CharTy, false);
+      return NoMatch;
+    } else if (Right.K == AK::WIntTy) {
+      if (!LeftWasPointer && Left.T->isIntegerType())
+        return integerTypeMatch(C, Left.T, C.WIntTy, true);
+      return NoMatch;
+    }
+    // It's hypothetically possible to create an AK::SpecificTy ArgType
+    // that matches another kind of ArgType, but in practice Clang doesn't
+    // do that, so ignore that case.
+    return NoMatch;
+  }
+
+  return Left.K == Right.K ? Match : NoMatch;
+}
+
 ArgType ArgType::makeVectorType(ASTContext &C, unsigned NumElts) const {
   // Check for valid vector element types.
   if (T.isNull())
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 2d4a7cd287b70d7..27fd147fda7708b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3016,17 +3016,33 @@ bool Sema::ValueIsRunOfOnes(CallExpr *TheCall, unsigned ArgNum) {
          << ArgNum << Arg->getSourceRange();
 }
 
-bool Sema::getFormatStringInfo(const FormatAttr *Format, bool IsCXXMember,
-                               bool IsVariadic, FormatStringInfo *FSI) {
-  if (Format->getFirstArg() == 0)
+bool Sema::getFormatStringInfo(const Decl *D, unsigned FormatIdx,
+                               unsigned FirstArg, FormatStringInfo *FSI) {
+  ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/116708


More information about the cfe-commits mailing list