[llvm-branch-commits] [clang] [llvm] Add format string handling (PR #196568)
Abhina Sree via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jun 19 06:44:45 PDT 2026
https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/196568
>From debd0182db40d0aebf12da43b6e26e497b8fbd63 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 19 Jun 2026 08:50:48 -0400
Subject: [PATCH 1/9] use LiteralEncoding internally, address other comments
---
clang/include/clang/Basic/DiagnosticLexKinds.td | 2 +-
clang/include/clang/Basic/LangOptions.h | 4 ++--
clang/include/clang/Lex/TextEncoding.h | 8 ++++----
clang/include/clang/Options/Options.td | 4 ++--
clang/lib/Frontend/CompilerInstance.cpp | 2 +-
clang/lib/Frontend/InitPreprocessor.cpp | 12 ++++++------
clang/lib/Lex/LiteralSupport.cpp | 2 +-
clang/lib/Lex/TextEncoding.cpp | 16 ++++++++--------
clang/lib/Sema/SemaExpr.cpp | 2 +-
clang/test/CodeGen/systemz-charset-diag.cpp | 2 +-
clang/test/CodeGen/systemz-charset.c | 10 ++++++++++
11 files changed, 37 insertions(+), 27 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index f12fa0205b650..3b0b4d87fc006 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -288,7 +288,7 @@ def ext_string_too_long : Extension<"string literal of length %0 exceeds "
def err_character_too_large : Error<
"character too large for enclosing character literal type">;
def err_exec_charset_conversion_failed
- : Error<"conversion to execution encoding failed: '%0'">;
+ : Error<"conversion to literal encoding failed: '%0'">;
def warn_c99_compat_unicode_literal : Warning<
"unicode literals are incompatible with C99">,
InGroup<C99Compat>, DefaultIgnore;
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 5ec31b356d059..bbf47c34b306a 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -618,8 +618,8 @@ class LangOptions : public LangOptionsBase {
/// The allocation token mode.
std::optional<llvm::AllocTokenMode> AllocTokenMode;
- /// Name of the execution encoding to convert the internal encoding to.
- std::string ExecEncoding;
+ /// Name of the literal encoding to convert the internal encoding to.
+ std::string LiteralEncoding;
LangOptions();
diff --git a/clang/include/clang/Lex/TextEncoding.h b/clang/include/clang/Lex/TextEncoding.h
index 770cb3c5eff1a..c892d1fadbc38 100644
--- a/clang/include/clang/Lex/TextEncoding.h
+++ b/clang/include/clang/Lex/TextEncoding.h
@@ -13,18 +13,18 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/TextEncoding.h"
-enum ConversionAction { CA_NoConversion, CA_ToExecEncoding };
+enum ConversionAction { CA_NoConversion, CA_ToLiteralEncoding };
class TextEncoding {
- llvm::StringRef ExecEncoding;
- llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;
+ llvm::StringRef LiteralEncoding;
+ llvm::TextEncodingConverter *ToLiteralEncodingConverter = nullptr;
public:
llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
static std::error_code
setConvertersFromOptions(TextEncoding &TE, const clang::LangOptions &Opts);
- llvm::StringRef getExecEncoding() { return ExecEncoding; }
+ llvm::StringRef getLiteralEncoding() { return LiteralEncoding; }
};
#endif
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 92b62fa8fceb4..bad318f703935 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -7537,10 +7537,10 @@ def tune_cpu : Separate<["-"], "tune-cpu">,
HelpText<"Tune for a specific cpu type">,
MarshallingInfoString<TargetOpts<"TuneCPU">>;
def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<encoding>">,
- HelpText<"Set the execution <encoding> for ordinary string and character literals. "
+ HelpText<"Set the <encoding> for ordinary string and character literals. "
"Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
"and possibly those supported by ICU or the host iconv library.">,
- MarshallingInfoString<LangOpts<"ExecEncoding">>;
+ MarshallingInfoString<LangOpts<"LiteralEncoding">>;
def target_cpu : Separate<["-"], "target-cpu">,
HelpText<"Target a specific cpu type">,
MarshallingInfoString<TargetOpts<"CPU">>;
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 952eb73c210ff..f4e0f09035fff 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -558,7 +558,7 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (auto EC = TextEncoding::setConvertersFromOptions(PP->getTextEncoding(),
getLangOpts()))
PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config)
- << PP->getTextEncoding().getExecEncoding();
+ << PP->getTextEncoding().getLiteralEncoding();
}
// ASTContext
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 15c62e39d9506..eb60e0e674fea 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1036,12 +1036,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
// Macros to help identify the narrow and wide character sets. This is set
// to fexec-charset. If fexec-charset is not specified, the default is the
// system charset.
- Builder.defineMacro(
- "__clang_literal_encoding__",
- Twine("\"" +
- (LangOpts.ExecEncoding.empty() ? TI.getDefaultOrdinaryTextEncoding()
- : LangOpts.ExecEncoding) +
- "\""));
+ Builder.defineMacro("__clang_literal_encoding__",
+ Twine("\"" +
+ (LangOpts.LiteralEncoding.empty()
+ ? TI.getDefaultOrdinaryTextEncoding()
+ : LangOpts.LiteralEncoding) +
+ "\""));
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 70070e8bb1f2a..e31dcc8c76db6 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1861,7 +1861,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
const TextEncoding &TE = PP.getTextEncoding();
llvm::TextEncodingConverter *Converter = nullptr;
if (isOrdinary())
- Converter = TE.getConverter(CA_ToExecEncoding);
+ Converter = TE.getConverter(CA_ToLiteralEncoding);
// Unicode escapes representing characters that cannot be correctly
// represented in a single code unit are disallowed in character literals
diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp
index ba878800564f0..393caaadd5d37 100644
--- a/clang/lib/Lex/TextEncoding.cpp
+++ b/clang/lib/Lex/TextEncoding.cpp
@@ -12,8 +12,8 @@
llvm::TextEncodingConverter *
TextEncoding::getConverter(ConversionAction Action) const {
switch (Action) {
- case CA_ToExecEncoding:
- return ToExecEncodingConverter;
+ case CA_ToLiteralEncoding:
+ return ToLiteralEncodingConverter;
default:
return nullptr;
}
@@ -25,17 +25,17 @@ TextEncoding::setConvertersFromOptions(TextEncoding &TEC,
using namespace llvm;
const char *UTF8 = "UTF-8";
- TEC.ExecEncoding =
- Opts.ExecEncoding.empty() ? UTF8 : Opts.ExecEncoding.c_str();
+ TEC.LiteralEncoding =
+ Opts.LiteralEncoding.empty() ? UTF8 : Opts.LiteralEncoding.c_str();
- // Create converter between internal and exec encoding specified
+ // Create converter between internal and literal encoding specified
// in fexec-charset option.
- if (TEC.ExecEncoding == UTF8)
+ if (TEC.LiteralEncoding == UTF8)
return std::error_code();
ErrorOr<TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create(UTF8, TEC.ExecEncoding);
+ llvm::TextEncodingConverter::create(UTF8, TEC.LiteralEncoding);
if (ErrorOrConverter)
- TEC.ToExecEncodingConverter =
+ TEC.ToLiteralEncodingConverter =
new TextEncodingConverter(std::move(*ErrorOrConverter));
else
return ErrorOrConverter.getError();
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index eea63e2497e06..391ad927af439 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2241,7 +2241,7 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks);
StringLiteralParser Literal(
- StringToks, PP, StringLiteralEvalMethod::Evaluated, CA_ToExecEncoding);
+ StringToks, PP, StringLiteralEvalMethod::Evaluated, CA_ToLiteralEncoding);
if (Literal.hadError)
return ExprError();
diff --git a/clang/test/CodeGen/systemz-charset-diag.cpp b/clang/test/CodeGen/systemz-charset-diag.cpp
index 5b398b4b58af6..4ed94810150a3 100644
--- a/clang/test/CodeGen/systemz-charset-diag.cpp
+++ b/clang/test/CodeGen/systemz-charset-diag.cpp
@@ -1,3 +1,3 @@
// RUN: %clang_cc1 -triple s390x-none-zos -fexec-charset IBM-1047 %s -std=c++17 -emit-llvm -o - -verify
-const char* Computer = "🖥️"; // expected-error-re {{conversion to execution encoding failed: {{.*}}}}
+const char* Computer = "🖥️"; // expected-error-re {{conversion to literal encoding failed: {{.*}}}}
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
index 897b9d2eeefa1..766b6a83f00ff 100644
--- a/clang/test/CodeGen/systemz-charset.c
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -56,3 +56,13 @@ const char *Unicode = "ÿ";
// RUN: not %clang_cc1 -fexec-charset invalid %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
// CHECK-ERROR: error: failed to set fexec-charset to 'invalid'
+#define HELLO "Hello "
+#define WORLD "World!"
+#define HELLO_WORLD HELLO WORLD
+const char* hello_macro = HELLO;
+//CHECK: c"\C8\85\93\93\96@\00"
+//CHECK-UTF8: c"Hello \00"
+
+const char* preprocessor_concatenation = HELLO_WORLD;
+//CHECK: c"\C8\85\93\93\96@\E6\96\99\93\84Z\00"
+//CHECK-UTF8: c"Hello World!\00"
>From b9055262bdec5d31d8e0be8b2d52eeed7c66ca0c Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 8 May 2026 12:17:22 -0400
Subject: [PATCH 2/9] add ParserConversionAction, do not translate unevaluated
strings
---
clang/include/clang/Parse/Parser.h | 1 +
clang/include/clang/Sema/Sema.h | 4 +++-
clang/lib/Parse/ParseDecl.cpp | 10 ++++++++++
clang/lib/Parse/ParseDeclCXX.cpp | 2 ++
clang/lib/Parse/ParseExpr.cpp | 6 +++---
clang/lib/Parse/Parser.cpp | 4 ++++
clang/lib/Sema/SemaExpr.cpp | 12 ++++++------
clang/test/CodeGen/systemz-charset-diag.cpp | 8 ++++++++
clang/test/CodeGen/systemz-charset.c | 5 +++++
9 files changed, 42 insertions(+), 10 deletions(-)
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index c6c492b4980af..b441998e54040 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -5715,6 +5715,7 @@ class Parser : public CodeCompletionHandler {
bool Finished;
};
ObjCImplParsingDataRAII *CurParsedObjCImpl;
+ ConversionAction ParserConversionAction;
/// StashAwayMethodOrFunctionBodyTokens - Consume the tokens and store them
/// for later parsing.
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b8d760e7e0975..d54e4ce19166a 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -55,6 +55,7 @@
#include "clang/Basic/TemplateKinds.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Basic/TypeTraits.h"
+#include "clang/Lex/LiteralConverter.h"
#include "clang/Sema/AnalysisBasedWarnings.h"
#include "clang/Sema/Attr.h"
#include "clang/Sema/CleanupInfo.h"
@@ -7374,7 +7375,8 @@ class Sema final : public SemaBase {
/// from multiple tokens. However, the common case is that StringToks points
/// to one string.
ExprResult ActOnStringLiteral(ArrayRef<Token> StringToks,
- Scope *UDLScope = nullptr);
+ Scope *UDLScope = nullptr,
+ ConversionAction Action = CA_ToExecEncoding);
ExprResult ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks);
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 405dddf7991b4..97e0721c02b1b 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -564,6 +564,9 @@ unsigned Parser::ParseAttributeArgsCommon(
nullptr,
Sema::ExpressionEvaluationContextRecord::EK_AttrArgument);
+ SaveAndRestore<ConversionAction> SavedTranslationState(
+ ParserConversionAction, CA_NoConversion);
+
ExprResult ArgExpr = ParseAssignmentExpression();
if (ArgExpr.isInvalid()) {
SkipUntil(tok::r_paren, StopAtSemi);
@@ -644,6 +647,9 @@ void Parser::ParseGNUAttributeArgs(
ParsedAttr::Kind AttrKind =
ParsedAttr::getParsedKind(AttrName, ScopeName, Form.getSyntax());
+ SaveAndRestore<ConversionAction> SavedTranslationState(ParserConversionAction,
+ CA_NoConversion);
+
if (AttrKind == ParsedAttr::AT_Availability) {
ParseAvailabilityAttribute(*AttrName, AttrNameLoc, Attrs, EndLoc, ScopeName,
ScopeLoc, Form);
@@ -723,6 +729,9 @@ unsigned Parser::ParseClangAttributeArgs(
ParsedAttr::Kind AttrKind =
ParsedAttr::getParsedKind(AttrName, ScopeName, Form.getSyntax());
+ SaveAndRestore<ConversionAction> SavedTranslationState(ParserConversionAction,
+ CA_NoConversion);
+
switch (AttrKind) {
default:
return ParseAttributeArgsCommon(AttrName, AttrNameLoc, Attrs, EndLoc,
@@ -1546,6 +1555,7 @@ void Parser::ParseExternalSourceSymbolAttribute(
SkipUntil(tok::comma, tok::r_paren, StopAtSemi | StopBeforeMatch);
continue;
}
+
if (Keyword == Ident_language) {
if (HadLanguage) {
Diag(KeywordLoc, diag::err_external_source_symbol_duplicate_clause)
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 893989bd2398f..388cfa662068a 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1001,6 +1001,8 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) {
return nullptr;
}
} else if (tokenIsLikeStringLiteral(Tok, getLangOpts())) {
+ SaveAndRestore<ConversionAction> SavedTranslationState(
+ ParserConversionAction, CA_NoConversion);
AssertMessage = ParseUnevaluatedStringLiteralExpression();
} else {
Diag(Tok, diag::err_expected_string_literal)
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 2987d32d6e0d2..f8855d06fa343 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -3060,9 +3060,9 @@ ExprResult Parser::ParseStringLiteralExpression(bool AllowUserDefinedLiteral,
}
// Pass the set of string tokens, ready for concatenation, to the actions.
- return Actions.ActOnStringLiteral(StringToks,
- AllowUserDefinedLiteral ? getCurScope()
- : nullptr);
+ return Actions.ActOnStringLiteral(
+ StringToks, AllowUserDefinedLiteral ? getCurScope() : nullptr,
+ ParserConversionAction);
}
ExprResult Parser::ParseGenericSelectionExpression() {
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index 5e1fd4df1a3f0..7ac5e0a36d60e 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -70,6 +70,8 @@ Parser::Parser(Preprocessor &pp, Sema &actions, bool skipFunctionBodies)
NumCachedScopes = 0;
CurParsedObjCImpl = nullptr;
+ ParserConversionAction = CA_ToExecEncoding;
+
// Add #pragma handlers. These are removed and destroyed in the
// destructor.
initializePragmaHandlers();
@@ -1551,6 +1553,8 @@ void Parser::ParseKNRParamDeclarations(Declarator &D) {
}
ExprResult Parser::ParseAsmStringLiteral(bool ForAsmLabel) {
+ SaveAndRestore<ConversionAction> SavedTranslationState(ParserConversionAction,
+ CA_NoConversion);
ExprResult AsmString;
if (isTokenStringLiteral()) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 391ad927af439..089fdc5c5b6cc 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2159,8 +2159,8 @@ ExprResult Sema::ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks) {
if (getLangOpts().MicrosoftExt)
StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks);
- StringLiteralParser Literal(StringToks, PP,
- StringLiteralEvalMethod::Unevaluated);
+ StringLiteralParser Literal(
+ StringToks, PP, StringLiteralEvalMethod::Unevaluated, CA_NoConversion);
if (Literal.hadError)
return ExprError();
@@ -2231,8 +2231,8 @@ Sema::ExpandFunctionLocalPredefinedMacros(ArrayRef<Token> Toks) {
return ExpandedToks;
}
-ExprResult
-Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
+ExprResult Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope,
+ ConversionAction Action) {
assert(!StringToks.empty() && "Must have at least one string!");
// StringToks needs backing storage as it doesn't hold array elements itself
@@ -2240,8 +2240,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
if (getLangOpts().MicrosoftExt)
StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks);
- StringLiteralParser Literal(
- StringToks, PP, StringLiteralEvalMethod::Evaluated, CA_ToLiteralEncoding);
+ StringLiteralParser Literal(StringToks, PP,
+ StringLiteralEvalMethod::Evaluated, Action);
if (Literal.hadError)
return ExprError();
diff --git a/clang/test/CodeGen/systemz-charset-diag.cpp b/clang/test/CodeGen/systemz-charset-diag.cpp
index 4ed94810150a3..ad08e1f391214 100644
--- a/clang/test/CodeGen/systemz-charset-diag.cpp
+++ b/clang/test/CodeGen/systemz-charset-diag.cpp
@@ -1,3 +1,11 @@
// RUN: %clang_cc1 -triple s390x-none-zos -fexec-charset IBM-1047 %s -std=c++17 -emit-llvm -o - -verify
const char* Computer = "🖥️"; // expected-error-re {{conversion to literal encoding failed: {{.*}}}}
+
+static_assert(false, "Error string"); // expected-error {{static assertion failed: Error string}}
+
+[[deprecated("message")]] void test_deprecated() {return;} // expected-note {{'test_deprecated' has been explicitly marked deprecated here}}
+
+int main() {
+ test_deprecated(); // expected-warning {{'test_deprecated' is deprecated: message}}
+}
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
index 766b6a83f00ff..618b0cc203ab6 100644
--- a/clang/test/CodeGen/systemz-charset.c
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -66,3 +66,8 @@ const char* hello_macro = HELLO;
const char* preprocessor_concatenation = HELLO_WORLD;
//CHECK: c"\C8\85\93\93\96@\E6\96\99\93\84Z\00"
//CHECK-UTF8: c"Hello World!\00"
+
+void test1() {
+ printf(__FUNCTION__);
+}
+//CHECK: @__FUNCTION__.test1 = private unnamed_addr constant [6 x i8] c"\A3\85\A2\A3\F1\00"
>From 840f505abbb265ac25f665ab8c8451725f1cd051 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 8 May 2026 12:29:23 -0400
Subject: [PATCH 3/9] Remove old include
---
clang/include/clang/Sema/Sema.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d54e4ce19166a..aecd0d1c2f5dd 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -55,7 +55,6 @@
#include "clang/Basic/TemplateKinds.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Basic/TypeTraits.h"
-#include "clang/Lex/LiteralConverter.h"
#include "clang/Sema/AnalysisBasedWarnings.h"
#include "clang/Sema/Attr.h"
#include "clang/Sema/CleanupInfo.h"
>From 7ee3d5c76f9538338d6449f52b2d5cf120afb389 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 11 May 2026 09:27:48 -0400
Subject: [PATCH 4/9] Fix build failure
---
clang/include/clang/Sema/Sema.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index aecd0d1c2f5dd..5d00b0c94daa3 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -55,6 +55,7 @@
#include "clang/Basic/TemplateKinds.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Basic/TypeTraits.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "clang/Sema/AnalysisBasedWarnings.h"
#include "clang/Sema/Attr.h"
#include "clang/Sema/CleanupInfo.h"
>From ff2c43189b2224b2f0e4e3ba68d4d3558149e634 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 12 May 2026 08:07:08 -0400
Subject: [PATCH 5/9] fix CI
---
clang/test/CodeGen/systemz-charset.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
index 618b0cc203ab6..16f269f8fb2f5 100644
--- a/clang/test/CodeGen/systemz-charset.c
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -1,6 +1,8 @@
// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | FileCheck %s --check-prefix=CHECK-UTF8
+int printf(char const *, ...);
+
const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
//CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
//CHECK-UTF8: c"ABCDEFGHIJKLMNOPQRSTUVWXYZ\00"
>From 4011c35715375f7e727819b2fd287ab6644e3f02 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 12 May 2026 15:21:15 -0400
Subject: [PATCH 6/9] fix CI
---
clang/include/clang/AST/Expr.h | 6 ++++++
clang/include/clang/Sema/Sema.h | 4 ++--
clang/lib/AST/Expr.cpp | 15 +++++++++++++++
clang/lib/Parse/ParseDecl.cpp | 1 -
clang/lib/Parse/Parser.cpp | 2 +-
clang/lib/Sema/SemaExpr.cpp | 5 +++--
6 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index b91bf4a5375fb..69ac328c8f0a7 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -28,6 +28,7 @@
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SyncScope.h"
#include "clang/Basic/TypeTraits.h"
+#include "clang/Lex/TextEncoding.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SmallVector.h"
@@ -2066,6 +2067,11 @@ class PredefinedExpr final
return getIdentKindName(getIdentKind());
}
+ static std::string
+ ComputeNameAndTranslate(PredefinedIdentKind IK, const Decl *CurrentDecl,
+ TextEncoding &TE,
+ bool ForceElaboratedPrinting = false);
+
static std::string ComputeName(PredefinedIdentKind IK,
const Decl *CurrentDecl,
bool ForceElaboratedPrinting = false);
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 5d00b0c94daa3..f78455769a082 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -55,7 +55,7 @@
#include "clang/Basic/TemplateKinds.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Basic/TypeTraits.h"
-#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Lex/TextEncoding.h"
#include "clang/Sema/AnalysisBasedWarnings.h"
#include "clang/Sema/Attr.h"
#include "clang/Sema/CleanupInfo.h"
@@ -7376,7 +7376,7 @@ class Sema final : public SemaBase {
/// to one string.
ExprResult ActOnStringLiteral(ArrayRef<Token> StringToks,
Scope *UDLScope = nullptr,
- ConversionAction Action = CA_ToExecEncoding);
+ ConversionAction Action = CA_ToLiteralEncoding);
ExprResult ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks);
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 90747be4208e1..ead2880b9ebec 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -673,6 +673,21 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedIdentKind IK) {
llvm_unreachable("Unknown ident kind for PredefinedExpr");
}
+std::string PredefinedExpr::ComputeNameAndTranslate(
+ PredefinedIdentKind IK, const Decl *CurrentDecl, TextEncoding &TE,
+ bool ForceElaboratedPrinting) {
+ using namespace clang::charinfo;
+ std::string Result = ComputeName(IK, CurrentDecl, ForceElaboratedPrinting);
+ llvm::TextEncodingConverter *Converter =
+ TE.getConverter(CA_ToLiteralEncoding);
+ if (Converter) {
+ SmallString<128> Converted;
+ Converter->convert(Result, Converted);
+ Result = std::string(Converted);
+ }
+ return Result;
+}
+
// FIXME: Maybe this should use DeclPrinter with a special "print predefined
// expr" policy instead.
std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 97e0721c02b1b..3aa41ebc05397 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -1555,7 +1555,6 @@ void Parser::ParseExternalSourceSymbolAttribute(
SkipUntil(tok::comma, tok::r_paren, StopAtSemi | StopBeforeMatch);
continue;
}
-
if (Keyword == Ident_language) {
if (HadLanguage) {
Diag(KeywordLoc, diag::err_external_source_symbol_duplicate_clause)
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index 7ac5e0a36d60e..5a199b842fe8e 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -70,7 +70,7 @@ Parser::Parser(Preprocessor &pp, Sema &actions, bool skipFunctionBodies)
NumCachedScopes = 0;
CurParsedObjCImpl = nullptr;
- ParserConversionAction = CA_ToExecEncoding;
+ ParserConversionAction = CA_ToLiteralEncoding;
// Add #pragma handlers. These are removed and destroyed in the
// destructor.
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 089fdc5c5b6cc..eac281b523862 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -3636,8 +3636,9 @@ ExprResult Sema::BuildPredefinedExpr(SourceLocation Loc,
// the string.
bool ForceElaboratedPrinting =
IK == PredefinedIdentKind::Function && getLangOpts().MSVCCompat;
- auto Str =
- PredefinedExpr::ComputeName(IK, currentDecl, ForceElaboratedPrinting);
+ auto Str = PredefinedExpr::ComputeNameAndTranslate(
+ IK, currentDecl, getPreprocessor().getTextEncoding(),
+ ForceElaboratedPrinting);
unsigned Length = Str.length();
llvm::APInt LengthI(32, Length + 1);
>From 6b3785f0e03b0344a7459bc185733342b47dcc43 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 8 May 2026 12:19:11 -0400
Subject: [PATCH 7/9] Add format string handling
---
clang/include/clang/AST/FormatString.h | 12 ++--
clang/include/clang/Basic/TargetInfo.h | 3 +
clang/include/clang/Lex/TextEncoding.h | 3 +-
clang/lib/AST/FormatString.cpp | 86 ++++++++++++-----------
clang/lib/AST/FormatStringParsing.h | 36 +++++++---
clang/lib/AST/PrintfFormatString.cpp | 89 +++++++++++++++---------
clang/lib/AST/ScanfFormatString.cpp | 23 +++---
clang/lib/Basic/TargetInfo.cpp | 3 +
clang/lib/Frontend/CompilerInstance.cpp | 4 +-
clang/lib/Lex/TextEncoding.cpp | 11 ++-
clang/lib/Sema/SemaChecking.cpp | 54 ++++++++------
llvm/include/llvm/Support/TextEncoding.h | 10 +++
llvm/lib/Support/TextEncoding.cpp | 19 +++++
13 files changed, 233 insertions(+), 120 deletions(-)
diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h
index a3382e1a1d007..a24ade2d71ee9 100644
--- a/clang/include/clang/AST/FormatString.h
+++ b/clang/include/clang/AST/FormatString.h
@@ -19,6 +19,7 @@
#define LLVM_CLANG_AST_FORMATSTRING_H
#include "clang/AST/CanonicalType.h"
+#include "llvm/Support/TextEncoding.h"
#include <optional>
namespace clang {
@@ -728,7 +729,8 @@ class FormatStringHandler {
virtual bool HandleInvalidPrintfConversionSpecifier(
const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier,
- unsigned specifierLen) {
+ unsigned specifierLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
return true;
}
@@ -744,10 +746,10 @@ class FormatStringHandler {
// Scanf-specific handlers.
- virtual bool
- HandleInvalidScanfConversionSpecifier(const analyze_scanf::ScanfSpecifier &FS,
- const char *startSpecifier,
- unsigned specifierLen) {
+ virtual bool HandleInvalidScanfConversionSpecifier(
+ const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier,
+ unsigned specifierLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
return true;
}
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index a4984cffc430a..909bde840d3fa 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -38,6 +38,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/TextEncoding.h"
#include "llvm/Support/VersionTuple.h"
#include "llvm/TargetParser/Triple.h"
#include <cassert>
@@ -323,6 +324,8 @@ class TargetInfo : public TransferrableTargetInfo,
virtual ~TargetInfo();
+ llvm::TextEncodingConverter *FormatStrConverter;
+
/// Retrieve the target options.
TargetOptions &getTargetOpts() const {
assert(TargetOpts && "Missing target options");
diff --git a/clang/include/clang/Lex/TextEncoding.h b/clang/include/clang/Lex/TextEncoding.h
index c892d1fadbc38..f525b06cff37b 100644
--- a/clang/include/clang/Lex/TextEncoding.h
+++ b/clang/include/clang/Lex/TextEncoding.h
@@ -22,7 +22,8 @@ class TextEncoding {
public:
llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
static std::error_code
- setConvertersFromOptions(TextEncoding &TE, const clang::LangOptions &Opts);
+ setConvertersFromOptions(TextEncoding &TE, const clang::LangOptions &Opts,
+ clang::TargetInfo &TInfo);
llvm::StringRef getLiteralEncoding() { return LiteralEncoding; }
};
diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp
index 7e1ac0de6dcaf..0d449fb5f0904 100644
--- a/clang/lib/AST/FormatString.cpp
+++ b/clang/lib/AST/FormatString.cpp
@@ -33,8 +33,9 @@ FormatStringHandler::~FormatStringHandler() {}
// scanf format strings.
//===----------------------------------------------------------------------===//
-OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg,
- const char *E) {
+OptionalAmount clang::analyze_format_string::ParseAmount(
+ const char *&Beg, const char *E,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
const char *I = Beg;
UpdateOnReturn<const char *> UpdateBeg(Beg, I);
@@ -42,7 +43,7 @@ OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg,
bool hasDigits = false;
for (; I != E; ++I) {
- char c = *I;
+ char c = FormatStrConverter.convert(*I);
if (c >= '0' && c <= '9') {
hasDigits = true;
accumulator = (accumulator * 10) + (c - '0');
@@ -60,21 +61,22 @@ OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg,
}
OptionalAmount clang::analyze_format_string::ParseNonPositionAmount(
- const char *&Beg, const char *E, unsigned &argIndex) {
- if (*Beg == '*') {
+ const char *&Beg, const char *E, unsigned &argIndex,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
+ if (FormatStrConverter.convert(*Beg) == '*') {
++Beg;
return OptionalAmount(OptionalAmount::Arg, argIndex++, Beg, 0, false);
}
- return ParseAmount(Beg, E);
+ return ParseAmount(Beg, E, FormatStrConverter);
}
OptionalAmount clang::analyze_format_string::ParsePositionAmount(
FormatStringHandler &H, const char *Start, const char *&Beg, const char *E,
- PositionContext p) {
- if (*Beg == '*') {
+ PositionContext p, const llvm::TextEncodingConverter &FormatStrConverter) {
+ if (FormatStrConverter.convert(*Beg) == '*') {
const char *I = Beg + 1;
- const OptionalAmount &Amt = ParseAmount(I, E);
+ const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter);
if (Amt.getHowSpecified() == OptionalAmount::NotSpecified) {
H.HandleInvalidPosition(Beg, I - Beg, p);
@@ -89,7 +91,7 @@ OptionalAmount clang::analyze_format_string::ParsePositionAmount(
assert(Amt.getHowSpecified() == OptionalAmount::Constant);
- if (*I == '$') {
+ if (FormatStrConverter.convert(*I) == '$') {
// Handle positional arguments
// Special case: '*0$', since this is an easy mistake.
@@ -109,18 +111,21 @@ OptionalAmount clang::analyze_format_string::ParsePositionAmount(
return OptionalAmount(false);
}
- return ParseAmount(Beg, E);
+ return ParseAmount(Beg, E, FormatStrConverter);
}
bool clang::analyze_format_string::ParseFieldWidth(
FormatStringHandler &H, FormatSpecifier &CS, const char *Start,
- const char *&Beg, const char *E, unsigned *argIndex) {
+ const char *&Beg, const char *E, unsigned *argIndex,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
// FIXME: Support negative field widths.
if (argIndex) {
- CS.setFieldWidth(ParseNonPositionAmount(Beg, E, *argIndex));
+ CS.setFieldWidth(
+ ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter));
} else {
const OptionalAmount Amt = ParsePositionAmount(
- H, Start, Beg, E, analyze_format_string::FieldWidthPos);
+ H, Start, Beg, E, analyze_format_string::FieldWidthPos,
+ FormatStrConverter);
if (Amt.isInvalid())
return true;
@@ -129,14 +134,13 @@ bool clang::analyze_format_string::ParseFieldWidth(
return false;
}
-bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H,
- FormatSpecifier &FS,
- const char *Start,
- const char *&Beg,
- const char *E) {
+bool clang::analyze_format_string::ParseArgPosition(
+ FormatStringHandler &H, FormatSpecifier &FS, const char *Start,
+ const char *&Beg, const char *E,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
const char *I = Beg;
- const OptionalAmount &Amt = ParseAmount(I, E);
+ const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter);
if (I == E) {
// No more characters left?
@@ -144,7 +148,8 @@ bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H,
return true;
}
- if (Amt.getHowSpecified() == OptionalAmount::Constant && *(I++) == '$') {
+ if (Amt.getHowSpecified() == OptionalAmount::Constant &&
+ FormatStrConverter.convert(*(I++)) == '$') {
// Warn that positional arguments are non-standard.
H.HandlePosition(Start, I - Start);
@@ -165,16 +170,15 @@ bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H,
return false;
}
-bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H,
- FormatSpecifier &FS,
- const char *&I,
- const char *E,
- const LangOptions &LO) {
+bool clang::analyze_format_string::ParseVectorModifier(
+ FormatStringHandler &H, FormatSpecifier &FS, const char *&I, const char *E,
+ const LangOptions &LO,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
if (!LO.OpenCL)
return false;
const char *Start = I;
- if (*I == 'v') {
+ if (FormatStrConverter.convert(*I) == 'v') {
++I;
if (I == E) {
@@ -182,7 +186,7 @@ bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H,
return true;
}
- OptionalAmount NumElts = ParseAmount(I, E);
+ OptionalAmount NumElts = ParseAmount(I, E, FormatStrConverter);
if (NumElts.getHowSpecified() != OptionalAmount::Constant) {
H.HandleIncompleteSpecifier(Start, E - Start);
return true;
@@ -194,22 +198,20 @@ bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H,
return false;
}
-bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS,
- const char *&I,
- const char *E,
- const LangOptions &LO,
- bool IsScanf) {
+bool clang::analyze_format_string::ParseLengthModifier(
+ FormatSpecifier &FS, const char *&I, const char *E, const LangOptions &LO,
+ const llvm::TextEncodingConverter &FormatStrConverter, bool IsScanf) {
LengthModifier::Kind lmKind = LengthModifier::None;
const char *lmPosition = I;
- switch (*I) {
+ switch (FormatStrConverter.convert(*I)) {
default:
return false;
case 'h':
++I;
- if (I != E && *I == 'h') {
+ if (I != E && FormatStrConverter.convert(*I) == 'h') {
++I;
lmKind = LengthModifier::AsChar;
- } else if (I != E && *I == 'l' && LO.OpenCL) {
+ } else if (I != E && FormatStrConverter.convert(*I) == 'l' && LO.OpenCL) {
++I;
lmKind = LengthModifier::AsShortLong;
} else {
@@ -218,7 +220,7 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS,
break;
case 'l':
++I;
- if (I != E && *I == 'l') {
+ if (I != E && FormatStrConverter.convert(*I) == 'l') {
++I;
lmKind = LengthModifier::AsLongLong;
} else {
@@ -251,7 +253,9 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS,
// be parsed as the GNU extension 'a' length modifier. If not, this
// will be parsed as a conversion specifier.
++I;
- if (I != E && (*I == 's' || *I == 'S' || *I == '[')) {
+ if (I != E && (FormatStrConverter.convert(*I) == 's' ||
+ FormatStrConverter.convert(*I) == 'S' ||
+ FormatStrConverter.convert(*I) == '[')) {
lmKind = LengthModifier::AsAllocate;
break;
}
@@ -269,7 +273,8 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS,
// scanf: AsInt64
case 'I':
if (I + 1 != E && I + 2 != E) {
- if (I[1] == '6' && I[2] == '4') {
+ if (FormatStrConverter.convert(I[1]) == '6' &&
+ FormatStrConverter.convert(I[2]) == '4') {
I += 3;
lmKind = LengthModifier::AsInt64;
break;
@@ -277,7 +282,8 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS,
if (IsScanf)
return false;
- if (I[1] == '3' && I[2] == '2') {
+ if (FormatStrConverter.convert(I[1]) == '3' &&
+ FormatStrConverter.convert(I[2]) == '2') {
I += 3;
lmKind = LengthModifier::AsInt32;
break;
diff --git a/clang/lib/AST/FormatStringParsing.h b/clang/lib/AST/FormatStringParsing.h
index 401528481a9d6..531bc291e0b5b 100644
--- a/clang/lib/AST/FormatStringParsing.h
+++ b/clang/lib/AST/FormatStringParsing.h
@@ -35,29 +35,43 @@ template <typename T> class UpdateOnReturn {
namespace analyze_format_string {
-OptionalAmount ParseAmount(const char *&Beg, const char *E);
-OptionalAmount ParseNonPositionAmount(const char *&Beg, const char *E,
- unsigned &argIndex);
+OptionalAmount
+ParseAmount(const char *&Beg, const char *E,
+ const llvm::TextEncodingConverter &FormatStrConverter);
-OptionalAmount ParsePositionAmount(FormatStringHandler &H, const char *Start,
- const char *&Beg, const char *E,
- PositionContext p);
+OptionalAmount
+ParseNonPositionAmount(const char *&Beg, const char *E, unsigned &argIndex,
+ const llvm::TextEncodingConverter &FormatStrConverter);
+
+OptionalAmount
+ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg,
+ const char *E, PositionContext p,
+ const llvm::TextEncodingConverter &FormatStrConverter);
+
+OptionalAmount
+ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg,
+ const char *E, PositionContext p,
+ const llvm::TextEncodingConverter &FormatStrConverter);
bool ParseFieldWidth(FormatStringHandler &H, FormatSpecifier &CS,
const char *Start, const char *&Beg, const char *E,
- unsigned *argIndex);
+ unsigned *argIndex,
+ const llvm::TextEncodingConverter &FormatStrConverter);
bool ParseArgPosition(FormatStringHandler &H, FormatSpecifier &CS,
- const char *Start, const char *&Beg, const char *E);
+ const char *Start, const char *&Beg, const char *E,
+ const llvm::TextEncodingConverter &FormatStrConverter);
bool ParseVectorModifier(FormatStringHandler &H, FormatSpecifier &FS,
- const char *&Beg, const char *E,
- const LangOptions &LO);
+ const char *&Beg, const char *E, const LangOptions &LO,
+ const llvm::TextEncodingConverter &FormatStrConverter);
/// Returns true if a LengthModifier was parsed and installed in the
/// FormatSpecifier& argument, and false otherwise.
bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E,
- const LangOptions &LO, bool IsScanf = false);
+ const LangOptions &LO,
+ const llvm::TextEncodingConverter &FormatStrConverter,
+ bool IsScanf = false);
/// Returns true if the invalid specifier in \p SpecifierBegin is a UTF-8
/// string; check that it won't go further than \p FmtStrEnd and write
diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp
index 6610a2de9e083..7efcc554ec136 100644
--- a/clang/lib/AST/PrintfFormatString.cpp
+++ b/clang/lib/AST/PrintfFormatString.cpp
@@ -35,14 +35,17 @@ typedef clang::analyze_format_string::SpecifierResult<PrintfSpecifier>
using analyze_format_string::ParseNonPositionAmount;
-static bool ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS,
- const char *Start, const char *&Beg, const char *E,
- unsigned *argIndex) {
+static bool
+ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, const char *Start,
+ const char *&Beg, const char *E, unsigned *argIndex,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
if (argIndex) {
- FS.setPrecision(ParseNonPositionAmount(Beg, E, *argIndex));
+ FS.setPrecision(
+ ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter));
} else {
const OptionalAmount Amt = ParsePositionAmount(
- H, Start, Beg, E, analyze_format_string::PrecisionPos);
+ H, Start, Beg, E, analyze_format_string::PrecisionPos,
+ FormatStrConverter);
if (Amt.isInvalid())
return true;
FS.setPrecision(Amt);
@@ -50,11 +53,14 @@ static bool ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS,
return false;
}
-static bool ParseObjCFlags(FormatStringHandler &H, PrintfSpecifier &FS,
- const char *FlagBeg, const char *E, bool Warn) {
+static bool
+ParseObjCFlags(FormatStringHandler &H, PrintfSpecifier &FS, const char *FlagBeg,
+ const char *E, bool Warn,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
StringRef Flag(FlagBeg, E - FlagBeg);
// Currently there is only one flag.
- if (Flag == "tt") {
+ if (Flag.size() == 2 && FormatStrConverter.convert(FlagBeg[0]) == 't' &&
+ FormatStrConverter.convert(FlagBeg[1]) == 't') {
FS.setHasObjCTechnicalTerm(FlagBeg);
return false;
}
@@ -81,6 +87,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
const char *Start = nullptr;
UpdateOnReturn<const char *> UpdateBeg(Beg, I);
+ const llvm::TextEncodingConverter &FormatStrConverter =
+ *Target.FormatStrConverter;
// Look for a '%' character that indicates the start of a format specifier.
for (; I != E; ++I) {
char c = *I;
@@ -89,7 +97,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
H.HandleNullChar(I);
return true;
}
- if (c == '%') {
+ if (FormatStrConverter.convert(c) == '%') {
Start = I++; // Record the start of the format specifier.
break;
}
@@ -107,7 +115,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
}
PrintfSpecifier FS;
- if (ParseArgPosition(H, FS, Start, I, E))
+ if (ParseArgPosition(H, FS, Start, I, E, FormatStrConverter))
return true;
if (I == E) {
@@ -117,13 +125,17 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
return true;
}
- if (*I == '{') {
+ if (FormatStrConverter.convert(*I) == '{') {
++I;
unsigned char PrivacyFlags = 0;
StringRef MatchedStr;
do {
- StringRef Str(I, E - I);
+ const char *II;
+ std::string S(I, E - I);
+ for (unsigned long i = 0; i < S.length(); ++i)
+ S[i] = FormatStrConverter.convert(S[i]);
+ StringRef Str(S);
std::string Match = "^[[:space:]]*"
"(private|public|sensitive|mask\\.[^[:space:],}]*)"
"[[:space:]]*(,|})";
@@ -132,25 +144,38 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
if (R.match(Str, &Matches)) {
MatchedStr = Matches[1];
+ II = I;
I += Matches[0].size();
+ while (FormatStrConverter.convert(*II) == ' ')
+ ++II;
+
// Set the privacy flag if the privacy annotation in the
// comma-delimited segment is at least as strict as the privacy
// annotations in previous comma-delimited segments.
if (MatchedStr.starts_with("mask")) {
- StringRef MaskType = MatchedStr.substr(sizeof("mask.") - 1);
+ StringRef MaskType(II + sizeof("mask.") - 1,
+ MatchedStr.size() - sizeof("mask.") + 1);
unsigned Size = MaskType.size();
+
if (Warn && (Size == 0 || Size > 8))
H.handleInvalidMaskType(MaskType);
FS.setMaskType(MaskType);
- } else if (MatchedStr == "sensitive")
+ } else if (MatchedStr == "sensitive") {
+ StringRef ProxyMatchedStr(II, sizeof("sensitive") - 1);
+ MatchedStr = ProxyMatchedStr;
PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsSensitive;
- else if (PrivacyFlags !=
- clang::analyze_os_log::OSLogBufferItem::IsSensitive &&
- MatchedStr == "private")
+ } else if (PrivacyFlags !=
+ clang::analyze_os_log::OSLogBufferItem::IsSensitive &&
+ MatchedStr == "private") {
+ StringRef ProxyMatchedStr(II, sizeof("private") - 1);
+ MatchedStr = ProxyMatchedStr;
PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPrivate;
- else if (PrivacyFlags == 0 && MatchedStr == "public")
+ } else if (PrivacyFlags == 0 && MatchedStr == "public") {
+ StringRef ProxyMatchedStr(II, sizeof("public") - 1);
+ MatchedStr = ProxyMatchedStr;
PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPublic;
+ }
} else {
size_t CommaOrBracePos =
Str.find_if([](char c) { return c == ',' || c == '}'; });
@@ -165,7 +190,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
I += CommaOrBracePos + 1;
}
// Continue until the closing brace is found.
- } while (*(I - 1) == ',');
+ } while (FormatStrConverter.convert(*(I - 1)) == ',');
// Set the privacy flag.
switch (PrivacyFlags) {
@@ -188,7 +213,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
// Look for flags (if any).
bool hasMore = true;
for (; I != E; ++I) {
- switch (*I) {
+ switch (FormatStrConverter.convert(*I)) {
default:
hasMore = false;
break;
@@ -225,7 +250,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
// Look for the field width (if any).
if (ParseFieldWidth(H, FS, Start, I, E,
- FS.usesPositionalArg() ? nullptr : &argIndex))
+ FS.usesPositionalArg() ? nullptr : &argIndex,
+ FormatStrConverter))
return true;
if (I == E) {
@@ -236,7 +262,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
}
// Look for the precision (if any).
- if (*I == '.') {
+ if (FormatStrConverter.convert(*I) == '.') {
++I;
if (I == E) {
if (Warn)
@@ -245,7 +271,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
}
if (ParsePrecision(H, FS, Start, I, E,
- FS.usesPositionalArg() ? nullptr : &argIndex))
+ FS.usesPositionalArg() ? nullptr : &argIndex,
+ FormatStrConverter))
return true;
if (I == E) {
@@ -256,11 +283,11 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
}
}
- if (ParseVectorModifier(H, FS, I, E, LO))
+ if (ParseVectorModifier(H, FS, I, E, LO, FormatStrConverter))
return true;
// Look for the length modifier.
- if (ParseLengthModifier(FS, I, E, LO) && I == E) {
+ if (ParseLengthModifier(FS, I, E, LO, FormatStrConverter) && I == E) {
// No more characters left?
if (Warn)
H.HandleIncompleteSpecifier(Start, E - Start);
@@ -274,7 +301,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
// enables better recovery, and we don't know if
// these flags are applicable until later.
const char *ObjCModifierFlagsStart = nullptr, *ObjCModifierFlagsEnd = nullptr;
- if (*I == '[') {
+ if (FormatStrConverter.convert(*I) == '[') {
ObjCModifierFlagsStart = I;
++I;
auto flagStart = I;
@@ -286,8 +313,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
return true;
}
// Did we find the closing ']'?
- if (*I == ']') {
- if (ParseObjCFlags(H, FS, flagStart, I, Warn))
+ if (FormatStrConverter.convert(*I) == ']') {
+ if (ParseObjCFlags(H, FS, flagStart, I, Warn, FormatStrConverter))
return true;
++I;
break;
@@ -307,7 +334,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
// Finally, look for the conversion specifier.
const char *conversionPosition = I++;
ConversionSpecifier::Kind k = ConversionSpecifier::InvalidSpecifier;
- switch (*conversionPosition) {
+ switch (FormatStrConverter.convert(*conversionPosition)) {
default:
break;
// C99: 7.19.6.1 (section 8).
@@ -470,7 +497,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E,
FS.setConversionSpecifier(CS);
}
// Assume the conversion takes one argument.
- return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len);
+ return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len,
+ FormatStrConverter);
}
return PrintfSpecifierResult(Start, FS);
}
@@ -480,7 +508,6 @@ bool clang::analyze_format_string::ParsePrintfString(
const TargetInfo &Target, bool isFreeBSDKPrintf) {
unsigned argIndex = 0;
-
// Keep looking for a format specifier until we have exhausted the string.
while (I != E) {
const PrintfSpecifierResult &FSR = ParsePrintfSpecifier(
diff --git a/clang/lib/AST/ScanfFormatString.cpp b/clang/lib/AST/ScanfFormatString.cpp
index 90cbbd60bbcf5..c63171844d90d 100644
--- a/clang/lib/AST/ScanfFormatString.cpp
+++ b/clang/lib/AST/ScanfFormatString.cpp
@@ -81,7 +81,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
const char *I = Beg;
const char *Start = nullptr;
UpdateOnReturn<const char *> UpdateBeg(Beg, I);
-
+ const llvm::TextEncodingConverter &FormatStrConverter =
+ *Target.FormatStrConverter;
// Look for a '%' character that indicates the start of a format specifier.
for (; I != E; ++I) {
char c = *I;
@@ -90,7 +91,9 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
H.HandleNullChar(I);
return true;
}
- if (c == '%') {
+ SmallString<1> ConvertedChar;
+ FormatStrConverter.convert(StringRef(&c, 1), ConvertedChar);
+ if (ConvertedChar[0] == '%') {
Start = I++; // Record the start of the format specifier.
break;
}
@@ -107,7 +110,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
}
ScanfSpecifier FS;
- if (ParseArgPosition(H, FS, Start, I, E))
+ if (ParseArgPosition(H, FS, Start, I, E, FormatStrConverter))
return true;
if (I == E) {
@@ -117,7 +120,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
}
// Look for '*' flag if it is present.
- if (*I == '*') {
+ if (FormatStrConverter.convert(*I) == '*') {
FS.setSuppressAssignment(I);
if (++I == E) {
H.HandleIncompleteSpecifier(Start, E - Start);
@@ -127,7 +130,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
// Look for the field width (if any). Unlike printf, this is either
// a fixed integer or isn't present.
- const OptionalAmount &Amt = clang::analyze_format_string::ParseAmount(I, E);
+ const OptionalAmount &Amt =
+ clang::analyze_format_string::ParseAmount(I, E, FormatStrConverter);
if (Amt.getHowSpecified() != OptionalAmount::NotSpecified) {
assert(Amt.getHowSpecified() == OptionalAmount::Constant);
FS.setFieldWidth(Amt);
@@ -140,7 +144,9 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
}
// Look for the length modifier.
- if (ParseLengthModifier(FS, I, E, LO, /*IsScanf=*/true) && I == E) {
+ if (ParseLengthModifier(FS, I, E, LO, FormatStrConverter,
+ /*IsScanf=*/true) &&
+ I == E) {
// No more characters left?
H.HandleIncompleteSpecifier(Start, E - Start);
return true;
@@ -155,7 +161,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
// Finally, look for the conversion specifier.
const char *conversionPosition = I++;
ScanfConversionSpecifier::Kind k = ScanfConversionSpecifier::InvalidSpecifier;
- switch (*conversionPosition) {
+ switch (FormatStrConverter.convert(*conversionPosition)) {
default:
break;
case '%':
@@ -262,7 +268,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
FS.setConversionSpecifier(CS);
}
// Assume the conversion takes one argument.
- return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len);
+ return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len,
+ FormatStrConverter);
}
return ScanfSpecifierResult(Start, FS);
}
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 854d23cadaea2..0864d6855068a 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -194,6 +194,9 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
MaxOpenCLWorkGroupSize = 1024;
MaxBitIntWidth.reset();
+
+ FormatStrConverter = new llvm::TextEncodingConverter(
+ std::move(*llvm::TextEncodingConverter::createNoopConverter()));
}
// Out of line virtual dtor for TargetInfo.
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index f4e0f09035fff..ef2899c47f3c8 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -555,8 +555,8 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
- if (auto EC = TextEncoding::setConvertersFromOptions(PP->getTextEncoding(),
- getLangOpts()))
+ if (auto EC = TextEncoding::setConvertersFromOptions(
+ PP->getTextEncoding(), getLangOpts(), getTarget()))
PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config)
<< PP->getTextEncoding().getLiteralEncoding();
}
diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp
index 393caaadd5d37..ec9945ec789bd 100644
--- a/clang/lib/Lex/TextEncoding.cpp
+++ b/clang/lib/Lex/TextEncoding.cpp
@@ -21,7 +21,8 @@ TextEncoding::getConverter(ConversionAction Action) const {
std::error_code
TextEncoding::setConvertersFromOptions(TextEncoding &TEC,
- const clang::LangOptions &Opts) {
+ const clang::LangOptions &Opts,
+ clang::TargetInfo &TInfo) {
using namespace llvm;
const char *UTF8 = "UTF-8";
@@ -39,5 +40,13 @@ TextEncoding::setConvertersFromOptions(TextEncoding &TEC,
new TextEncodingConverter(std::move(*ErrorOrConverter));
else
return ErrorOrConverter.getError();
+
+ ErrorOrConverter = llvm::TextEncodingConverter::create(TEC.SystemEncoding,
+ TEC.InternalEncoding);
+
+ if (ErrorOrConverter)
+ TInfo.FormatStrConverter =
+ new TextEncodingConverter(std::move(*ErrorOrConverter));
+
return std::error_code();
}
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 8a8c9cc9d2c23..d9cae43a69fdc 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -104,6 +104,7 @@
#include "llvm/Support/Locale.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Support/TextEncoding.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TargetParser/RISCVTargetParser.h"
#include "llvm/TargetParser/Triple.h"
@@ -7930,10 +7931,10 @@ class CheckFormatHandler : public analyze_format_string::FormatStringHandler {
ArrayRef<FixItHint> Fixit = {});
protected:
- bool HandleInvalidConversionSpecifier(unsigned argIndex, SourceLocation Loc,
- const char *startSpec,
- unsigned specifierLen,
- const char *csStart, unsigned csLen);
+ bool HandleInvalidConversionSpecifier(
+ unsigned argIndex, SourceLocation Loc, const char *startSpec,
+ unsigned specifierLen, const char *csStart, unsigned csLen,
+ const llvm::TextEncodingConverter &FormatStrConverter);
void HandlePositionalNonpositionalArgs(SourceLocation Loc,
const char *startSpec,
@@ -8163,7 +8164,8 @@ void UncoveredArgHandler::Diagnose(Sema &S, bool IsFunctionCall,
bool CheckFormatHandler::HandleInvalidConversionSpecifier(
unsigned argIndex, SourceLocation Loc, const char *startSpec,
- unsigned specifierLen, const char *csStart, unsigned csLen) {
+ unsigned specifierLen, const char *csStart, unsigned csLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
bool keepGoing = true;
if (argIndex < NumDataArgs) {
// Consider the argument coverered, even though the specifier doesn't
@@ -8178,7 +8180,13 @@ bool CheckFormatHandler::HandleInvalidConversionSpecifier(
keepGoing = false;
}
- StringRef Specifier(csStart, csLen);
+ // The csStart points to a character that has already been converted to the
+ // exec charset, so we have to reverse the conversion to allow diagnostic
+ // message to match an expected value when using -verify option,
+ std::string RS(csStart, csLen);
+ for (unsigned int i = 0; i < RS.size(); ++i)
+ RS[i] = FormatStrConverter.convert(RS[i]);
+ StringRef Specifier(RS);
// If the specifier in non-printable, it could be the first byte of a UTF-8
// sequence. In that case, print the UTF-8 code point. If not, print the byte
@@ -8332,7 +8340,8 @@ class CheckPrintfHandler : public CheckFormatHandler {
bool HandleInvalidPrintfConversionSpecifier(
const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier,
- unsigned specifierLen) override;
+ unsigned specifierLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) override;
void handleInvalidMaskType(StringRef MaskType) override;
@@ -8472,13 +8481,14 @@ class DecomposePrintfHandler : public CheckPrintfHandler {
bool CheckPrintfHandler::HandleInvalidPrintfConversionSpecifier(
const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier,
- unsigned specifierLen) {
+ unsigned specifierLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
const analyze_printf::PrintfConversionSpecifier &CS =
FS.getConversionSpecifier();
return HandleInvalidConversionSpecifier(
FS.getArgIndex(), getLocationOfByte(CS.getStart()), startSpecifier,
- specifierLen, CS.getStart(), CS.getLength());
+ specifierLen, CS.getStart(), CS.getLength(), FormatStrConverter);
}
void CheckPrintfHandler::handleInvalidMaskType(StringRef MaskType) {
@@ -8986,15 +8996,15 @@ bool CheckPrintfHandler::HandlePrintfSpecifier(
// Check for using an Objective-C specific conversion specifier
// in a non-ObjC literal.
if (!allowsObjCArg() && CS.isObjCArg()) {
- return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier,
- specifierLen);
+ return HandleInvalidPrintfConversionSpecifier(
+ FS, startSpecifier, specifierLen, *Target.FormatStrConverter);
}
// %P can only be used with os_log.
if (FSType != FormatStringType::OSLog &&
CS.getKind() == ConversionSpecifier::PArg) {
- return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier,
- specifierLen);
+ return HandleInvalidPrintfConversionSpecifier(
+ FS, startSpecifier, specifierLen, *Target.FormatStrConverter);
}
// %n is not allowed with os_log.
@@ -9013,8 +9023,8 @@ bool CheckPrintfHandler::HandlePrintfSpecifier(
(CS.getKind() == ConversionSpecifier::PArg ||
CS.getKind() == ConversionSpecifier::sArg ||
CS.getKind() == ConversionSpecifier::ObjCObjArg)) {
- return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier,
- specifierLen);
+ return HandleInvalidPrintfConversionSpecifier(
+ FS, startSpecifier, specifierLen, *Target.FormatStrConverter);
}
// Check for use of public/private annotation outside of os_log().
@@ -9687,10 +9697,10 @@ class CheckScanfHandler : public CheckFormatHandler {
const char *startSpecifier,
unsigned specifierLen) override;
- bool
- HandleInvalidScanfConversionSpecifier(const analyze_scanf::ScanfSpecifier &FS,
- const char *startSpecifier,
- unsigned specifierLen) override;
+ bool HandleInvalidScanfConversionSpecifier(
+ const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier,
+ unsigned specifierLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) override;
void HandleIncompleteScanList(const char *start, const char *end) override;
};
@@ -9706,13 +9716,15 @@ void CheckScanfHandler::HandleIncompleteScanList(const char *start,
bool CheckScanfHandler::HandleInvalidScanfConversionSpecifier(
const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier,
- unsigned specifierLen) {
+ unsigned specifierLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
+
const analyze_scanf::ScanfConversionSpecifier &CS =
FS.getConversionSpecifier();
return HandleInvalidConversionSpecifier(
FS.getArgIndex(), getLocationOfByte(CS.getStart()), startSpecifier,
- specifierLen, CS.getStart(), CS.getLength());
+ specifierLen, CS.getStart(), CS.getLength(), FormatStrConverter);
}
bool CheckScanfHandler::HandleScanfSpecifier(
diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h
index 8a304910aa5dd..8f5a6122ede45 100644
--- a/llvm/include/llvm/Support/TextEncoding.h
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -105,6 +105,8 @@ class TextEncodingConverter {
LLVM_ABI static ErrorOr<TextEncodingConverter> create(StringRef From,
StringRef To);
+ LLVM_ABI static ErrorOr<TextEncodingConverter> createNoopConverter();
+
TextEncodingConverter(const TextEncodingConverter &) = delete;
TextEncodingConverter &operator=(const TextEncodingConverter &) = delete;
@@ -135,6 +137,14 @@ class TextEncodingConverter {
return std::string(Result);
return EC;
}
+
+ char convert(char SingleChar) const { // single-byte convenience overload
+ SmallString<1> Result;
+ auto EC = Converter->convert(StringRef(&SingleChar, 1), Result);
+ if (!EC && !Result.empty()) // never index an empty Result
+ return Result[0]; // any further output bytes are dropped
+ return '\0'; // conversion failed or produced no output
+ }
};
} // namespace llvm
diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp
index d36f02c1300b9..5c1d9696686a2 100644
--- a/llvm/lib/Support/TextEncoding.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -356,3 +356,22 @@ ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
return std::make_error_code(std::errc::invalid_argument);
#endif
}
+
+class TextEncodingConverterNoop final
+ : public details::TextEncodingConverterImplBase {
+ // Pass-through implementation: the output bytes are the input bytes.
+public:
+ TextEncodingConverterNoop() = default;
+ // Overwrites Result with a copy of Source and reports success.
+ std::error_code convertString(StringRef Source,
+ SmallVectorImpl<char> &Result) override {
+ Result.assign(Source.begin(), Source.end());
+ return std::error_code();
+ }
+ // This class declares no data members of its own; nothing to reset.
+ void reset() override {}
+};
+// Wraps a TextEncodingConverterNoop in the public TextEncodingConverter type.
+ErrorOr<TextEncodingConverter> TextEncodingConverter::createNoopConverter() {
+ return TextEncodingConverter(std::make_unique<TextEncodingConverterNoop>());
+}
>From 48c2a16354369421987de1ab9e69770b99586654 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 13 May 2026 15:10:35 -0400
Subject: [PATCH 8/9] fix CI
---
clang/include/clang/Lex/TextEncoding.h | 1 +
clang/lib/Lex/TextEncoding.cpp | 4 ++--
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Lex/TextEncoding.h b/clang/include/clang/Lex/TextEncoding.h
index f525b06cff37b..097e96371338b 100644
--- a/clang/include/clang/Lex/TextEncoding.h
+++ b/clang/include/clang/Lex/TextEncoding.h
@@ -10,6 +10,7 @@
#define LLVM_CLANG_LEX_TEXTENCODING_H
#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/TextEncoding.h"
diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp
index ec9945ec789bd..682b45816cc7e 100644
--- a/clang/lib/Lex/TextEncoding.cpp
+++ b/clang/lib/Lex/TextEncoding.cpp
@@ -41,8 +41,8 @@ TextEncoding::setConvertersFromOptions(TextEncoding &TEC,
else
return ErrorOrConverter.getError();
- ErrorOrConverter = llvm::TextEncodingConverter::create(TEC.SystemEncoding,
- TEC.InternalEncoding);
+ ErrorOrConverter = llvm::TextEncodingConverter::create(
+ TInfo.getDefaultNarrowTextEncoding(), UTF8);
if (ErrorOrConverter)
TInfo.FormatStrConverter =
>From d96b76449eed75eaa209aa69be39412e6ce5fbe7 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 28 May 2026 15:16:49 -0400
Subject: [PATCH 9/9] do not convert character by character
---
clang/lib/Lex/TextEncoding.cpp | 2 +-
clang/lib/Sema/SemaChecking.cpp | 10 ++++++----
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp
index 682b45816cc7e..df3e4dbcaf4b4 100644
--- a/clang/lib/Lex/TextEncoding.cpp
+++ b/clang/lib/Lex/TextEncoding.cpp
@@ -42,7 +42,7 @@ TextEncoding::setConvertersFromOptions(TextEncoding &TEC,
return ErrorOrConverter.getError();
ErrorOrConverter = llvm::TextEncodingConverter::create(
- TInfo.getDefaultNarrowTextEncoding(), UTF8);
+ TInfo.getDefaultOrdinaryTextEncoding(), UTF8);
if (ErrorOrConverter)
TInfo.FormatStrConverter =
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index d9cae43a69fdc..6ec6979440369 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -8183,10 +8183,12 @@ bool CheckFormatHandler::HandleInvalidConversionSpecifier(
// The csStart points to a character that has already been converted to the
// exec charset, so we have to reverse the conversion to allow diagnostic
// message to match an expected value when using -verify option,
- std::string RS(csStart, csLen);
- for (unsigned int i = 0; i < RS.size(); ++i)
- RS[i] = FormatStrConverter.convert(RS[i]);
- StringRef Specifier(RS);
+ SmallString<4> RS;
+ auto EC = FormatStrConverter.convert(StringRef(csStart, csLen), RS);
+ if (EC) {
+ keepGoing = false;
+ }
+ llvm::StringRef Specifier(RS);
// If the specifier in non-printable, it could be the first byte of a UTF-8
// sequence. In that case, print the UTF-8 code point. If not, print the byte
More information about the llvm-branch-commits
mailing list