r264752 - [Sema] Handle UTF-8 invalid format string specifiers

Bruno Cardoso Lopes via cfe-commits cfe-commits at lists.llvm.org
Tue Mar 29 10:35:02 PDT 2016


Author: bruno
Date: Tue Mar 29 12:35:02 2016
New Revision: 264752

URL: http://llvm.org/viewvc/llvm-project?rev=264752&view=rev
Log:
[Sema] Handle UTF-8 invalid format string specifiers

Improve invalid format string specifier handling by printing out
invalid specifiers characters with \x, \u and \U. Previously clang
would print gargabe whenever the character is unprintable.

Example, before:
  NSLog(@"%\u25B9"); => warning: invalid conversion specifier ' [-Wformat-invalid-specifier]
after:
  NSLog(@"%\u25B9"); => warning: invalid conversion specifier '\u25b9' [-Wformat-invalid-specifier]

Differential Revision: http://reviews.llvm.org/D18296

rdar://problem/24672159

Modified:
    cfe/trunk/include/clang/Analysis/Analyses/FormatString.h
    cfe/trunk/lib/Analysis/FormatString.cpp
    cfe/trunk/lib/Analysis/FormatStringParsing.h
    cfe/trunk/lib/Analysis/PrintfFormatString.cpp
    cfe/trunk/lib/Analysis/ScanfFormatString.cpp
    cfe/trunk/lib/Sema/SemaChecking.cpp
    cfe/trunk/test/Sema/format-strings-scanf.c
    cfe/trunk/test/Sema/format-strings.c
    cfe/trunk/test/SemaObjC/format-strings-objc.m

Modified: cfe/trunk/include/clang/Analysis/Analyses/FormatString.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Analysis/Analyses/FormatString.h?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/include/clang/Analysis/Analyses/FormatString.h (original)
+++ cfe/trunk/include/clang/Analysis/Analyses/FormatString.h Tue Mar 29 12:35:02 2016
@@ -210,6 +210,7 @@ public:
   unsigned getLength() const {
     return EndScanList ? EndScanList - Position : 1;
   }
+  void setEndScanList(const char *pos) { EndScanList = pos; }
 
   bool isIntArg() const { return (kind >= IntArgBeg && kind <= IntArgEnd) ||
     kind == FreeBSDrArg || kind == FreeBSDyArg; }
@@ -413,11 +414,6 @@ public:
   bool isObjCArg() const { return kind >= ObjCBeg && kind <= ObjCEnd; }
   bool isDoubleArg() const { return kind >= DoubleArgBeg &&
                                     kind <= DoubleArgEnd; }
-  unsigned getLength() const {
-      // Conversion specifiers currently only are represented by
-      // single characters, but we be flexible.
-    return 1;
-  }
 
   static bool classof(const analyze_format_string::ConversionSpecifier *CS) {
     return CS->isPrintfKind();
@@ -546,8 +542,6 @@ public:
   ScanfConversionSpecifier(const char *pos, Kind k)
     : ConversionSpecifier(false, pos, k) {}
 
-  void setEndScanList(const char *pos) { EndScanList = pos; }
-
   static bool classof(const analyze_format_string::ConversionSpecifier *CS) {
     return !CS->isPrintfKind();
   }

Modified: cfe/trunk/lib/Analysis/FormatString.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/FormatString.cpp?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/lib/Analysis/FormatString.cpp (original)
+++ cfe/trunk/lib/Analysis/FormatString.cpp Tue Mar 29 12:35:02 2016
@@ -15,6 +15,7 @@
 #include "FormatStringParsing.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TargetInfo.h"
+#include "llvm/Support/ConvertUTF.h"
 
 using clang::analyze_format_string::ArgType;
 using clang::analyze_format_string::FormatStringHandler;
@@ -260,6 +261,28 @@ clang::analyze_format_string::ParseLengt
   return true;
 }
 
+bool clang::analyze_format_string::ParseUTF8InvalidSpecifier(
+    const char *SpecifierBegin, const char *FmtStrEnd, unsigned &Len) {
+  if (SpecifierBegin + 1 >= FmtStrEnd)
+    return false;
+
+  const UTF8 *SB = reinterpret_cast<const UTF8 *>(SpecifierBegin + 1);
+  const UTF8 *SE = reinterpret_cast<const UTF8 *>(FmtStrEnd);
+  const char FirstByte = *SB;
+
+  // If the invalid specifier is a multibyte UTF-8 string, return the
+  // total length accordingly so that the conversion specifier can be
+  // properly updated to reflect a complete UTF-8 specifier.
+  unsigned NumBytes = getNumBytesForUTF8(FirstByte);
+  if (NumBytes == 1)
+    return false;
+  if (SB + NumBytes > SE)
+    return false;
+
+  Len = NumBytes + 1;
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Methods on ArgType.
 //===----------------------------------------------------------------------===//

Modified: cfe/trunk/lib/Analysis/FormatStringParsing.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/FormatStringParsing.h?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/lib/Analysis/FormatStringParsing.h (original)
+++ cfe/trunk/lib/Analysis/FormatStringParsing.h Tue Mar 29 12:35:02 2016
@@ -46,7 +46,13 @@ bool ParseArgPosition(FormatStringHandle
 /// FormatSpecifier& argument, and false otherwise.
 bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E,
                          const LangOptions &LO, bool IsScanf = false);
-  
+
+/// Returns true if the invalid specifier in \p SpecifierBegin is a UTF-8
+/// string; check that it won't go further than \p FmtStrEnd and write
+/// up the total size in \p Len.
+bool ParseUTF8InvalidSpecifier(const char *SpecifierBegin,
+                               const char *FmtStrEnd, unsigned &Len);
+
 template <typename T> class SpecifierResult {
   T FS;
   const char *Start;

Modified: cfe/trunk/lib/Analysis/PrintfFormatString.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/PrintfFormatString.cpp?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/lib/Analysis/PrintfFormatString.cpp (original)
+++ cfe/trunk/lib/Analysis/PrintfFormatString.cpp Tue Mar 29 12:35:02 2016
@@ -312,8 +312,13 @@ static PrintfSpecifierResult ParsePrintf
     argIndex++;
 
   if (k == ConversionSpecifier::InvalidSpecifier) {
+    unsigned Len = I - Start;
+    if (ParseUTF8InvalidSpecifier(Start, E, Len)) {
+      CS.setEndScanList(Start + Len);
+      FS.setConversionSpecifier(CS);
+    }
     // Assume the conversion takes one argument.
-    return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, I - Start);
+    return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len);
   }
   return PrintfSpecifierResult(Start, FS);
 }

Modified: cfe/trunk/lib/Analysis/ScanfFormatString.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/ScanfFormatString.cpp?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/lib/Analysis/ScanfFormatString.cpp (original)
+++ cfe/trunk/lib/Analysis/ScanfFormatString.cpp Tue Mar 29 12:35:02 2016
@@ -79,7 +79,7 @@ static ScanfSpecifierResult ParseScanfSp
                                                 unsigned &argIndex,
                                                 const LangOptions &LO,
                                                 const TargetInfo &Target) {
-  
+  using namespace clang::analyze_format_string;
   using namespace clang::analyze_scanf;
   const char *I = Beg;
   const char *Start = nullptr;
@@ -210,10 +210,15 @@ static ScanfSpecifierResult ParseScanfSp
   
   // FIXME: '%' and '*' doesn't make sense.  Issue a warning.
   // FIXME: 'ConsumedSoFar' and '*' doesn't make sense.
-  
+
   if (k == ScanfConversionSpecifier::InvalidSpecifier) {
+    unsigned Len = I - Beg;
+    if (ParseUTF8InvalidSpecifier(Beg, E, Len)) {
+      CS.setEndScanList(Beg + Len);
+      FS.setConversionSpecifier(CS);
+    }
     // Assume the conversion takes one argument.
-    return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, I - Beg);
+    return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len);
   }
   return ScanfSpecifierResult(Start, FS);
 }

Modified: cfe/trunk/lib/Sema/SemaChecking.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/lib/Sema/SemaChecking.cpp (original)
+++ cfe/trunk/lib/Sema/SemaChecking.cpp Tue Mar 29 12:35:02 2016
@@ -36,6 +36,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Locale.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/raw_ostream.h"
 #include <limits>
@@ -3976,12 +3978,41 @@ CheckFormatHandler::HandleInvalidConvers
     // gibberish when trying to match arguments.
     keepGoing = false;
   }
-  
-  EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_conversion)
-                         << StringRef(csStart, csLen),
-                       Loc, /*IsStringLocation*/true,
-                       getSpecifierRange(startSpec, specifierLen));
-  
+
+  StringRef Specifier(csStart, csLen);
+
+  // If the specifier in non-printable, it could be the first byte of a UTF-8
+  // sequence. In that case, print the UTF-8 code point. If not, print the byte
+  // hex value.
+  std::string CodePointStr;
+  if (!llvm::sys::locale::isPrint(*csStart)) {
+    UTF32 CodePoint;
+    const UTF8 **B = reinterpret_cast<const UTF8 **>(&csStart);
+    const UTF8 *E =
+        reinterpret_cast<const UTF8 *>(csStart + csLen);
+    ConversionResult Result =
+        llvm::convertUTF8Sequence(B, E, &CodePoint, strictConversion);
+
+    if (Result != conversionOK) {
+      unsigned char FirstChar = *csStart;
+      CodePoint = (UTF32)FirstChar;
+    }
+
+    llvm::raw_string_ostream OS(CodePointStr);
+    if (CodePoint < 256)
+      OS << "\\x" << llvm::format("%02x", CodePoint);
+    else if (CodePoint <= 0xFFFF)
+      OS << "\\u" << llvm::format("%04x", CodePoint);
+    else
+      OS << "\\U" << llvm::format("%08x", CodePoint);
+    OS.flush();
+    Specifier = CodePointStr;
+  }
+
+  EmitFormatDiagnostic(
+      S.PDiag(diag::warn_format_invalid_conversion) << Specifier, Loc,
+      /*IsStringLocation*/ true, getSpecifierRange(startSpec, specifierLen));
+
   return keepGoing;
 }
 

Modified: cfe/trunk/test/Sema/format-strings-scanf.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Sema/format-strings-scanf.c?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/test/Sema/format-strings-scanf.c (original)
+++ cfe/trunk/test/Sema/format-strings-scanf.c Tue Mar 29 12:35:02 2016
@@ -183,3 +183,11 @@ void check_conditional_literal(char *s,
   scanf(i ? "%d" : "%d", i, s); // expected-warning{{data argument not used}}
   scanf(i ? "%s" : "%d", s); // expected-warning{{format specifies type 'int *'}}
 }
+
+void testInvalidNoPrintable(int *a) {
+  scanf("%\u25B9", a); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  scanf("%\xE2\x96\xB9", a); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  scanf("%\U00010348", a); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  scanf("%\xF0\x90\x8D\x88", a); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  scanf("%\xe2", a); // expected-warning {{invalid conversion specifier '\xe2'}}
+}

Modified: cfe/trunk/test/Sema/format-strings.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Sema/format-strings.c?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/test/Sema/format-strings.c (original)
+++ cfe/trunk/test/Sema/format-strings.c Tue Mar 29 12:35:02 2016
@@ -642,6 +642,14 @@ void test_qualifiers(volatile int *vip,
   printf("%n", (cip_t)0); // expected-warning{{format specifies type 'int *' but the argument has type 'cip_t' (aka 'const int *')}}
 }
 
+void testInvalidNoPrintable() {
+  printf("%\u25B9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  printf("%\xE2\x96\xB9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  printf("%\U00010348"); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  printf("%\xF0\x90\x8D\x88"); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  printf("%\xe2"); // expected-warning {{invalid conversion specifier '\xe2'}}
+}
+
 #pragma GCC diagnostic ignored "-Wformat-nonliteral"
 #pragma GCC diagnostic warning "-Wformat-security"
 // <rdar://problem/14178260>

Modified: cfe/trunk/test/SemaObjC/format-strings-objc.m
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaObjC/format-strings-objc.m?rev=264752&r1=264751&r2=264752&view=diff
==============================================================================
--- cfe/trunk/test/SemaObjC/format-strings-objc.m (original)
+++ cfe/trunk/test/SemaObjC/format-strings-objc.m Tue Mar 29 12:35:02 2016
@@ -265,3 +265,11 @@ void testObjCModifierFlags() {
   NSLog(@"%2$[tt]@ %1$[tt]s", @"Foo", @"Bar"); // expected-warning {{object format flags cannot be used with 's' conversion specifier}}
 }
 
+// Test Objective-C invalid no printable specifiers
+void testObjcInvalidNoPrintable(int *a) {
+  NSLog(@"%\u25B9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  NSLog(@"%\xE2\x96\xB9"); // expected-warning {{invalid conversion specifier '\u25b9'}}
+  NSLog(@"%\U00010348"); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  NSLog(@"%\xF0\x90\x8D\x88"); // expected-warning {{invalid conversion specifier '\U00010348'}}
+  NSLog(@"%\xe2"); // expected-warning {{input conversion stopped}} expected-warning {{invalid conversion specifier '\xe2'}}
+}




More information about the cfe-commits mailing list