[cfe-commits] r41076 - in /cfe/trunk: Sema/Sema.h Sema/SemaChecking.cpp Sema/SemaExpr.cpp include/clang/Basic/DiagnosticKinds.def test/Sema/format-strings.c

Ted Kremenek kremenek at apple.com
Tue Aug 14 10:39:50 PDT 2007


Author: kremenek
Date: Tue Aug 14 12:39:48 2007
New Revision: 41076

URL: http://llvm.org/viewvc/llvm-project?rev=41076&view=rev
Log:
Added support for additional format string checking for the printf
family of functions.  Previous functionality only included checking to
see if the format string was a string literal.  Now we check parse the
format string (if it is a literal) and perform the following checks:

(1) Warn if: number conversions (e.g. "%d") != number data arguments.

(2) Warn about missing format strings  (e.g., "printf()").

(3) Warn if the format string is not a string literal.

(4) Warn about the use se of '%n' conversion.  This conversion is
    discouraged for security reasons.

(5) Warn about malformed conversions.  For example '%;', '%v'; these
    are not valid.

(6) Warn about empty format strings; e.g. printf("").  Although these
    can be optimized away by the compiler, they can be indicative of
    broken programmer logic.  We may need to add additional support to
    see when such cases occur within macro expansion to avoid false
    positives.

(7) Warn if the string literal is wide; e.g. L"%d".

(8) Warn if we detect a '\0' character WITHIN the format string.

Test cases are included.

Modified:
    cfe/trunk/Sema/Sema.h
    cfe/trunk/Sema/SemaChecking.cpp
    cfe/trunk/Sema/SemaExpr.cpp
    cfe/trunk/include/clang/Basic/DiagnosticKinds.def
    cfe/trunk/test/Sema/format-strings.c

Modified: cfe/trunk/Sema/Sema.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/Sema/Sema.h?rev=41076&r1=41075&r2=41076&view=diff

==============================================================================
--- cfe/trunk/Sema/Sema.h (original)
+++ cfe/trunk/Sema/Sema.h Tue Aug 14 12:39:48 2007
@@ -423,10 +423,14 @@
   // Extra semantic analysis beyond the C type system
   private:
   
-  void CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl,
+  void CheckFunctionCall(Expr *Fn,
+                         SourceLocation LParenLoc, SourceLocation RParenLoc,
+                         FunctionDecl *FDecl,
                          Expr** Args, unsigned NumArgsInCall);
 
-  void CheckPrintfArguments(Expr *Fn, unsigned id_idx, FunctionDecl *FDecl,
+  void CheckPrintfArguments(Expr *Fn,
+                            SourceLocation LParenLoc, SourceLocation RParenLoc,
+                            bool HasVAListArg, FunctionDecl *FDecl,
                             unsigned format_idx, Expr** Args,
                             unsigned NumArgsInCall);
 };

Modified: cfe/trunk/Sema/SemaChecking.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/Sema/SemaChecking.cpp?rev=41076&r1=41075&r2=41076&view=diff

==============================================================================
--- cfe/trunk/Sema/SemaChecking.cpp (original)
+++ cfe/trunk/Sema/SemaChecking.cpp Tue Aug 14 12:39:48 2007
@@ -29,7 +29,9 @@
 /// CheckFunctionCall - Check a direct function call for various correctness
 /// and safety properties not strictly enforced by the C type system.
 void
-Sema::CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl,
+Sema::CheckFunctionCall(Expr *Fn,
+                        SourceLocation LParenLoc, SourceLocation RParenLoc,
+                        FunctionDecl *FDecl,
                         Expr** Args, unsigned NumArgsInCall) {
                         
   // Get the IdentifierInfo* for the called function.
@@ -37,55 +39,287 @@
   
   // Search the KnownFunctionIDs for the identifier.
   unsigned i = 0, e = id_num_known_functions;
-  for ( ; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; }
-  if( i == e ) return;
+  for (; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; }
+  if (i == e) return;
   
   // Printf checking.
   if (i <= id_vprintf) {
-    // Retrieve the index of the format string parameter.
+    // Retrieve the index of the format string parameter and determine
+    // if the function is passed a va_arg argument.
     unsigned format_idx = 0;
+    bool HasVAListArg = false;
+    
     switch (i) {
       default: assert(false && "No format string argument index.");
       case id_printf:    format_idx = 0; break;
       case id_fprintf:   format_idx = 1; break;
       case id_sprintf:   format_idx = 1; break;
       case id_snprintf:  format_idx = 2; break;
-      case id_vsnprintf: format_idx = 2; break;
-      case id_asprintf:  format_idx = 1; break;
-      case id_vasprintf: format_idx = 1; break;
-      case id_vfprintf:  format_idx = 1; break;
-      case id_vsprintf:  format_idx = 1; break;
-      case id_vprintf:   format_idx = 1; break;
-    }    
-    CheckPrintfArguments(Fn, i, FDecl, format_idx, Args, NumArgsInCall);
+      case id_asprintf:  format_idx = 1; HasVAListArg = true; break;
+      case id_vsnprintf: format_idx = 2; HasVAListArg = true; break;
+      case id_vasprintf: format_idx = 1; HasVAListArg = true; break;
+      case id_vfprintf:  format_idx = 1; HasVAListArg = true; break;
+      case id_vsprintf:  format_idx = 1; HasVAListArg = true; break;
+      case id_vprintf:   format_idx = 0; HasVAListArg = true; break;
+    }
+    
+    CheckPrintfArguments(Fn, LParenLoc, RParenLoc, HasVAListArg,
+			 FDecl, format_idx, Args, NumArgsInCall);
   }
 }
 
 /// CheckPrintfArguments - Check calls to printf (and similar functions) for
-/// correct use of format strings.  Improper format strings to functions in
-/// the printf family can be the source of bizarre bugs and very serious
-/// security holes.  A good source of information is available in the following
-/// paper (which includes additional references):
+/// correct use of format strings.  
+///
+///  HasVAListArg - A predicate indicating whether the printf-like
+///    function is passed an explicit va_arg argument (e.g., vprintf)
+///
+///  format_idx - The index into Args for the format string.
+///
+/// Improper format strings to functions in the printf family can be
+/// the source of bizarre bugs and very serious security holes.  A
+/// good source of information is available in the following paper
+/// (which includes additional references):
 ///
 ///  FormatGuard: Automatic Protection From printf Format String
 ///  Vulnerabilities, Proceedings of the 10th USENIX Security Symposium, 2001.
+///
+/// Functionality implemented:
+///
+///  We can statically check the following properties for string
+///  literal format strings for non v.*printf functions (where the
+///  arguments are passed directly):
+//
+///  (1) Are the number of format conversions equal to the number of
+///      data arguments?
+///
+///  (2) Does each format conversion correctly match the type of the
+///      corresponding data argument?  (TODO)
+///
+/// Moreover, for all printf functions we can:
+///
+///  (3) Check for a missing format string (when not caught by type checking).
+///
+///  (4) Check for no-operation flags; e.g. using "#" with format
+///      conversion 'c'  (TODO)
+///
+///  (5) Check the use of '%n', a major source of security holes.
+///
+///  (6) Check for malformed format conversions that don't specify anything.
+///
+///  (7) Check for empty format strings.  e.g: printf("");
+///
+///  (8) Check that the format string is a wide literal.
+///
+/// All of these checks can be done by parsing the format string.
+///
+/// For now, we ONLY do (1), (3), (5), (6), (7), and (8).
 void
-Sema::CheckPrintfArguments(Expr *Fn, unsigned id_idx, FunctionDecl *FDecl,
+Sema::CheckPrintfArguments(Expr *Fn, 
+                           SourceLocation LParenLoc, SourceLocation RParenLoc,
+                           bool HasVAListArg, FunctionDecl *FDecl,
                            unsigned format_idx, Expr** Args, 
                            unsigned NumArgsInCall) {
-                           
-  assert( format_idx < NumArgsInCall );
-
+  // CHECK: printf-like function is called with no format string.  
+  if (format_idx >= NumArgsInCall) {
+    Diag(RParenLoc, diag::warn_printf_missing_format_string, 
+         Fn->getSourceRange());
+    return;
+  }
+  
   // CHECK: format string is not a string literal.
   // 
-  // Dynamically generated format strings are difficult to automatically
-  // vet at compile time.  Requiring that format strings are string literals
-  // (1) permits the checking of format strings by the compiler and thereby
-  // (2) can practically remove the source of many format string exploits.
-
+  // Dynamically generated format strings are difficult to
+  // automatically vet at compile time.  Requiring that format strings
+  // are string literals: (1) permits the checking of format strings by
+  // the compiler and thereby (2) can practically remove the source of
+  // many format string exploits.
   StringLiteral *FExpr = dyn_cast<StringLiteral>(Args[format_idx]);
   
-  if ( FExpr == NULL )
-    Diag( Args[format_idx]->getLocStart(), 
-          diag::warn_printf_not_string_constant, Fn->getSourceRange() );
-}
\ No newline at end of file
+  if (FExpr == NULL) {
+    Diag(Args[format_idx]->getLocStart(), 
+         diag::warn_printf_not_string_constant, Fn->getSourceRange());
+    return;
+  }
+
+  // CHECK: is the format string a wide literal?
+  if (FExpr->isWide()) {
+    Diag(Args[format_idx]->getLocStart(),
+         diag::warn_printf_format_string_is_wide_literal,
+         Fn->getSourceRange());
+    return;
+  }
+
+  // Str - The format string.  NOTE: this is NOT null-terminated!
+  const char * const Str = FExpr->getStrData();
+
+  // CHECK: empty format string?
+  const unsigned StrLen = FExpr->getByteLength();
+  
+  if (StrLen == 0) {
+    Diag(Args[format_idx]->getLocStart(),
+         diag::warn_printf_empty_format_string, Fn->getSourceRange());
+    return;
+  }
+
+  // We process the format string using a binary state machine.  The
+  // current state is stored in CurrentState.
+  enum {
+    state_OrdChr,
+    state_Conversion
+  } CurrentState = state_OrdChr;
+  
+  // numConversions - The number of conversions seen so far.  This is
+  //  incremented as we traverse the format string.
+  unsigned numConversions = 0;
+
+  // numDataArgs - The number of data arguments after the format
+  //  string.  This can only be determined for non vprintf-like
+  //  functions.  For those functions, this value is 1 (the sole
+  //  va_arg argument).
+  unsigned numDataArgs = NumArgsInCall-(format_idx+1);
+
+  // Inspect the format string.
+  unsigned StrIdx = 0;
+  
+  // LastConversionIdx - Index within the format string where we last saw
+  //  a '%' character that starts a new format conversion.
+  unsigned LastConversionIdx = 0;
+  
+  for ( ; StrIdx < StrLen ; ++StrIdx ) {
+
+    // Is the number of detected conversion conversions greater than
+    // the number of matching data arguments?  If so, stop.
+    if (!HasVAListArg && numConversions > numDataArgs) break;
+    
+    // Handle "\0"
+    if(Str[StrIdx] == '\0' ) {
+      // The string returned by getStrData() is not null-terminated,
+      // so the presence of a null character is likely an error.
+    
+      SourceLocation Loc =
+      PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),StrIdx+1);
+    
+      Diag(Loc, diag::warn_printf_format_string_contains_null_char,
+           Fn->getSourceRange());
+    
+      return;
+    }
+    
+    // Ordinary characters (not processing a format conversion).
+    if (CurrentState == state_OrdChr) {
+      if (Str[StrIdx] == '%') {
+        CurrentState = state_Conversion;
+        LastConversionIdx = StrIdx;
+      }
+      continue;
+    }
+
+    // Seen '%'.  Now processing a format conversion.
+    switch (Str[StrIdx]) {
+      // Characters which can terminate a format conversion
+      // (e.g. "%d").  Characters that specify length modifiers or
+      // other flags are handled by the default case below.
+      //
+      // TODO: additional checks will go into the following cases.
+      case 'i':
+      case 'd':
+      case 'o': 
+      case 'u': 
+      case 'x':
+      case 'X':
+      case 'D':
+      case 'O':
+      case 'U':
+      case 'e':
+      case 'E':
+      case 'f':
+      case 'F':
+      case 'g':
+      case 'G':
+      case 'a':
+      case 'A':
+      case 'c':
+      case 'C':
+      case 'S':
+      case 's':
+      case 'P': 
+        ++numConversions;
+        CurrentState = state_OrdChr;
+        break;
+
+      // CHECK: Are we using "%n"?  Issue a warning.
+      case 'n': {
+        ++numConversions;
+        CurrentState = state_OrdChr;
+        SourceLocation Loc = 
+          PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
+                                     LastConversionIdx+1);
+                                     
+        Diag(Loc, diag::warn_printf_write_back, Fn->getSourceRange());
+        break;
+      }
+                    
+      // Handle "%%"
+      case '%':
+        // Sanity check: Was the first "%" character the previous one?
+        // If not, we will assume that we have a malformed format
+        // conversion, and that the current "%" character is the start
+        // of a new conversion.
+        if (StrIdx - LastConversionIdx == 1)
+          CurrentState = state_OrdChr; 
+        else {
+          // Issue a warning: invalid format conversion.
+          SourceLocation Loc =
+            PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
+                                       LastConversionIdx+1);
+              
+          Diag(Loc, diag::warn_printf_invalid_conversion, 
+	       std::string(Str+LastConversionIdx, Str+StrIdx),
+               Fn->getSourceRange());
+               
+          // This conversion is broken.  Advance to the next format
+          // conversion.
+          LastConversionIdx = StrIdx;
+          ++numConversions;
+        }
+        
+        break;
+                
+      default:
+        // This case catches all other characters: flags, widths, etc.
+        // We should eventually process those as well.
+        break;
+    }
+  }
+
+  if (CurrentState == state_Conversion) {
+    // Issue a warning: invalid format conversion.
+    SourceLocation Loc =
+      PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
+                                 LastConversionIdx+1);
+    
+    Diag(Loc, diag::warn_printf_invalid_conversion,
+	 std::string(Str+LastConversionIdx, Str+StrIdx),
+         Fn->getSourceRange());
+    return;
+  }
+  
+  if (!HasVAListArg) {
+    // CHECK: Does the number of format conversions exceed the number
+    //        of data arguments?
+    if (numConversions > numDataArgs) {
+      SourceLocation Loc =
+        PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
+                                   LastConversionIdx);
+                                   
+      Diag(Loc, diag::warn_printf_insufficient_data_args,
+           Fn->getSourceRange());
+    }
+    // CHECK: Does the number of data arguments exceed the number of
+    //        format conversions in the format string?
+    else if (numConversions < numDataArgs)
+      Diag(Args[format_idx+numConversions+1]->getLocStart(),
+           diag::warn_printf_too_many_data_args, Fn->getSourceRange());
+  }
+}

Modified: cfe/trunk/Sema/SemaExpr.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/Sema/SemaExpr.cpp?rev=41076&r1=41075&r2=41076&view=diff

==============================================================================
--- cfe/trunk/Sema/SemaExpr.cpp (original)
+++ cfe/trunk/Sema/SemaExpr.cpp Tue Aug 14 12:39:48 2007
@@ -561,7 +561,7 @@
   if (ImplicitCastExpr *IcExpr = dyn_cast<ImplicitCastExpr>(Fn))
     if (DeclRefExpr *DRExpr = dyn_cast<DeclRefExpr>(IcExpr->getSubExpr()))
       if (FunctionDecl *FDecl = dyn_cast<FunctionDecl>(DRExpr->getDecl()))
-        CheckFunctionCall(Fn, FDecl, Args, NumArgsInCall);
+        CheckFunctionCall(Fn, LParenLoc, RParenLoc, FDecl, Args, NumArgsInCall);
 
   return new CallExpr(Fn, Args, NumArgsInCall, resultType, RParenLoc);
 }

Modified: cfe/trunk/include/clang/Basic/DiagnosticKinds.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticKinds.def?rev=41076&r1=41075&r2=41076&view=diff

==============================================================================
--- cfe/trunk/include/clang/Basic/DiagnosticKinds.def (original)
+++ cfe/trunk/include/clang/Basic/DiagnosticKinds.def Tue Aug 14 12:39:48 2007
@@ -661,8 +661,24 @@
      
 // Printf checking
 DIAG(warn_printf_not_string_constant, WARNING,
-     "format string is not a string literal")
-
+     "format string is not a string literal (potentially insecure)")
+DIAG(warn_printf_write_back, WARNING,
+     "use of '%n' in format string discouraged (potentially insecure)")
+DIAG(warn_printf_insufficient_data_args, WARNING,
+    "more '%' conversions than data arguments")
+DIAG(warn_printf_too_many_data_args, WARNING,
+    "more data arguments than '%' conversions")
+DIAG(warn_printf_invalid_conversion, WARNING,
+    "invalid conversion '%0'")
+DIAG(warn_printf_missing_format_string, WARNING,
+    "format string missing")
+DIAG(warn_printf_empty_format_string, WARNING,
+   "format string is empty")
+DIAG(warn_printf_format_string_is_wide_literal, WARNING,
+  "format string should not be a wide string")
+DIAG(warn_printf_format_string_contains_null_char, WARNING,
+  "format string contains '\\0' within the string body")
+   
 // Statements.
 DIAG(err_continue_not_in_loop, ERROR,
      "'continue' statement not in loop statement")

Modified: cfe/trunk/test/Sema/format-strings.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Sema/format-strings.c?rev=41076&r1=41075&r2=41076&view=diff

==============================================================================
--- cfe/trunk/test/Sema/format-strings.c (original)
+++ cfe/trunk/test/Sema/format-strings.c Tue Aug 14 12:39:48 2007
@@ -21,3 +21,44 @@
   vsnprintf(buf,2,s,ap); // expected-warning {{mat string is not a string lit}}
 }
 
+void check_writeback_specifier()
+{
+  int x;
+  char *b;
+
+  printf("%n",&x); // expected-warning {{'%n' in format string discouraged}}
+  sprintf(b,"%d%%%n",1, &x); // expected-warning {{'%n' in format string dis}}
+}
+
+void check_invalid_specifier(FILE* fp, char *buf)
+{
+  printf("%s%lb%d","unix",10,20); // expected-warning {{lid conversion '%lb'}}
+  fprintf(fp,"%%%l"); // expected-warning {{lid conversion '%l'}}
+  sprintf(buf,"%%%%%ld%d%d", 1, 2, 3); // no-warning
+  snprintf(buf, 2, "%%%%%ld%;%d", 1, 2, 3); // expected-warning {{sion '%;'}}
+}
+
+void check_null_char_string(char* b)
+{
+  printf("\0this is bogus%d",1); // expected-warning {{string contains '\0'}}
+  snprintf(b,10,"%%%%%d\0%d",1,2); // expected-warning {{string contains '\0'}}
+  printf("%\0d",1); // expected-warning {{string contains '\0'}}
+}
+
+void check_empty_format_string(char* buf)
+{
+  va_list ap;
+  va_start(ap,buf);
+  vprintf("",ap); // expected-warning {{format string is empty}}
+  sprintf(buf,""); // expected-warning {{format string is empty}}
+}
+
+void check_wide_string()
+{
+  char *b;
+  va_list ap;
+  va_start(ap,b);
+
+  printf(L"foo %d",2); // expected-warning {{should not be a wide string}}
+  vasprintf(&b,L"bar %d",2); // expected-warning {{should not be a wide string}}
+}





More information about the cfe-commits mailing list