[clang] [llvm] Enable fexec-charset option (PR #138895)
Abhina Sree via cfe-commits
cfe-commits at lists.llvm.org
Mon Jun 8 11:48:29 PDT 2026
https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/138895
>From 3656fad704727278ae17d7c1762d89b345bf6254 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 8 May 2026 12:16:13 -0400
Subject: [PATCH 1/7] This patch enables the fexec-charset option to control
the execution charset of string literals. It sets the default internal
charset, system charset, and execution charset for z/OS and UTF-8 for all
other platforms.
---
.../clang/Basic/DiagnosticFrontendKinds.td | 1 +
.../include/clang/Basic/DiagnosticLexKinds.td | 2 +
clang/include/clang/Basic/LangOptions.h | 3 +
clang/include/clang/Lex/LiteralSupport.h | 19 +-
clang/include/clang/Lex/Preprocessor.h | 3 +
clang/include/clang/Lex/TextEncodingConfig.h | 34 ++++
clang/include/clang/Options/Options.td | 5 +
clang/lib/Frontend/CompilerInstance.cpp | 6 +
clang/lib/Frontend/FrontendAction.cpp | 4 +-
clang/lib/Frontend/InitPreprocessor.cpp | 15 +-
clang/lib/Lex/CMakeLists.txt | 1 +
clang/lib/Lex/LiteralSupport.cpp | 170 ++++++++++++++----
clang/lib/Lex/PPDirectives.cpp | 6 +-
clang/lib/Lex/TextEncodingConfig.cpp | 45 +++++
clang/test/CodeGen/systemz-charset.c | 58 ++++++
clang/test/CodeGen/systemz-charset.cpp | 70 ++++++++
clang/test/Preprocessor/init-s390x.c | 1 +
llvm/include/llvm/TargetParser/Triple.h | 4 +
llvm/lib/TargetParser/Triple.cpp | 7 +
19 files changed, 409 insertions(+), 45 deletions(-)
create mode 100644 clang/include/clang/Lex/TextEncodingConfig.h
create mode 100644 clang/lib/Lex/TextEncodingConfig.cpp
create mode 100644 clang/test/CodeGen/systemz-charset.c
create mode 100644 clang/test/CodeGen/systemz-charset.cpp
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index b585f0d3fa9a9..798fc4923d692 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -366,6 +366,7 @@ def warn_alias_type_mismatch : Warning<
"alias and aliasee have different types %0 and %1">,
InGroup<DiagGroup<"attribute-alias">>;
+def err_fe_text_encoding_config : Error<"failed to set fexec-charset to '%0'">;
def warn_atomic_op_misaligned : Warning<
"misaligned atomic operation may incur "
"significant performance penalty"
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 383bf1a7fdb3f..f12fa0205b650 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -287,6 +287,8 @@ def ext_string_too_long : Extension<"string literal of length %0 exceeds "
"support">, InGroup<OverlengthStrings>;
def err_character_too_large : Error<
"character too large for enclosing character literal type">;
+def err_exec_charset_conversion_failed
+ : Error<"conversion to execution encoding failed: '%0'">;
def warn_c99_compat_unicode_literal : Warning<
"unicode literals are incompatible with C99">,
InGroup<C99Compat>, DefaultIgnore;
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 64b12b6fd72c7..1501bc0e38218 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -618,6 +618,9 @@ class LangOptions : public LangOptionsBase {
/// The allocation token mode.
std::optional<llvm::AllocTokenMode> AllocTokenMode;
+ /// Name of the execution encoding to convert the internal encoding to.
+ std::string ExecEncoding;
+
LangOptions();
/// Set language defaults for the given input language and
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index ea5f63bc20399..6b404403ed95f 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -17,11 +17,13 @@
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/TextEncoding.h"
namespace clang {
@@ -233,6 +235,7 @@ class StringLiteralParser {
const LangOptions &Features;
const TargetInfo &Target;
DiagnosticsEngine *Diags;
+ TextEncodingConfig *TEC;
unsigned MaxTokenLength;
unsigned SizeBound;
@@ -246,18 +249,19 @@ class StringLiteralParser {
StringLiteralEvalMethod EvalMethod;
public:
- StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
- StringLiteralEvalMethod StringMethod =
- StringLiteralEvalMethod::Evaluated);
+ StringLiteralParser(
+ ArrayRef<Token> StringToks, Preprocessor &PP,
+ StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
+ ConversionAction Action = CA_ToExecEncoding);
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
- : SM(sm), Features(features), Target(target), Diags(diags),
+ : SM(sm), Features(features), Target(target), Diags(diags), TEC(nullptr),
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
ResultPtr(ResultBuf.data()),
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
Pascal(false) {
- init(StringToks);
+ init(StringToks, CA_NoConversion);
}
bool hadError;
@@ -305,9 +309,10 @@ class StringLiteralParser {
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);
private:
- void init(ArrayRef<Token> StringToks);
+ void init(ArrayRef<Token> StringToks, ConversionAction Action);
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
- StringRef Fragment);
+ StringRef Fragment,
+ llvm::TextEncodingConverter *Converter);
void DiagnoseLexingError(SourceLocation Loc);
};
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 8b684e85eb1c1..27fc7ef8d68dc 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -30,6 +30,7 @@
#include "clang/Lex/ModuleMap.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/PPEmbedParameters.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "clang/Lex/Token.h"
#include "clang/Lex/TokenLexer.h"
#include "clang/Support/Compiler.h"
@@ -198,6 +199,7 @@ class Preprocessor {
std::unique_ptr<ScratchBuffer> ScratchBuf;
HeaderSearch &HeaderInfo;
ModuleLoader &TheModuleLoader;
+ TextEncodingConfig TEC;
/// External source of macros.
ExternalPreprocessorSource *ExternalSource;
@@ -1264,6 +1266,7 @@ class Preprocessor {
SelectorTable &getSelectorTable() { return Selectors; }
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
+ TextEncodingConfig &getTextEncodingConfig() { return TEC; }
void setExternalSource(ExternalPreprocessorSource *Source) {
ExternalSource = Source;
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h
new file mode 100644
index 0000000000000..09967a81beeed
--- /dev/null
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -0,0 +1,34 @@
+//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/TextEncoding.h"
+
+enum ConversionAction { CA_NoConversion, CA_ToExecEncoding };
+
+class TextEncodingConfig {
+ llvm::StringRef ExecEncoding;
+ llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;
+
+public:
+ llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
+ static std::error_code
+ setConvertersFromOptions(TextEncodingConfig &TEC,
+ const clang::LangOptions &Opts);
+
+ llvm::StringRef getExecEncoding() { return ExecEncoding; }
+};
+
+#endif
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 753e3ac1b74a5..ee7937f38f1a3 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -7521,6 +7521,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in {
def tune_cpu : Separate<["-"], "tune-cpu">,
HelpText<"Tune for a specific cpu type">,
MarshallingInfoString<TargetOpts<"TuneCPU">>;
+def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<encoding>">,
+ HelpText<"Set the execution <encoding> for string and character literals. "
+ "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
+ "and possibly those supported by ICU or the host iconv library.">,
+ MarshallingInfoString<LangOpts<"ExecEncoding">>;
def target_cpu : Separate<["-"], "target-cpu">,
HelpText<"Target a specific cpu type">,
MarshallingInfoString<TargetOpts<"CPU">>;
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 9e88abbece7f2..dff396d91f2f1 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -34,6 +34,7 @@
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "clang/Sema/CodeCompleteConsumer.h"
#include "clang/Sema/ParsedAttr.h"
#include "clang/Sema/Sema.h"
@@ -553,6 +554,11 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
+
+ if (auto EC = TextEncodingConfig::setConvertersFromOptions(
+ PP->getTextEncodingConfig(), getLangOpts()))
+ PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config)
+ << PP->getTextEncodingConfig().getExecEncoding();
}
// ASTContext
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 7754861fabaf0..d9ca4c1b08ca0 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -525,7 +525,9 @@ static SourceLocation ReadOriginalFileName(CompilerInstance &CI,
if (T.isAtStartOfLine() || T.getKind() != tok::string_literal)
return SourceLocation();
- StringLiteralParser Literal(T, CI.getPreprocessor());
+ StringLiteralParser Literal(T, CI.getPreprocessor(),
+ StringLiteralEvalMethod::Evaluated,
+ CA_NoConversion);
if (Literal.hadError)
return SourceLocation();
RawLexer->LexFromRawLexer(T);
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 3f0468a938149..200eab9b971a7 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1033,10 +1033,17 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
}
}
- // Macros to help identify the narrow and wide character sets
- // FIXME: clang currently ignores -fexec-charset=. If this changes,
- // then this may need to be updated.
- Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\"");
+ // Macros to help identify the narrow and wide character sets. This is set
+ // to fexec-charset. If fexec-charset is not specified, the default is the
+ // system charset.
+ if (!LangOpts.ExecEncoding.empty())
+ Builder.defineMacro("__clang_literal_encoding__",
+ Twine("\"" + LangOpts.ExecEncoding + "\""));
+ else
+ Builder.defineMacro(
+ "__clang_literal_encoding__",
+ Twine("\"" + TI.getTriple().getDefaultNarrowTextEncoding() + "\""));
+
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
// if -fwide-exec-charset= is ever supported.
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt
index f61737cd68021..106a5d3b126be 100644
--- a/clang/lib/Lex/CMakeLists.txt
+++ b/clang/lib/Lex/CMakeLists.txt
@@ -29,6 +29,7 @@ add_clang_library(clangLex
Preprocessor.cpp
PreprocessorLexer.cpp
ScratchBuffer.cpp
+ TextEncodingConfig.cpp
TokenConcatenation.cpp
TokenLexer.cpp
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 482146ccf8654..9b8835bbf5e35 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -126,6 +126,17 @@ static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
return false;
}
+static llvm::ErrorOr<char>
+convertCharacter(StringRef Char, const llvm::TextEncodingConverter &Converter) {
+ SmallString<8> ResultCharConv;
+ std::error_code EC = Converter.convert(Char, ResultCharConv);
+ if (EC)
+ return EC;
+ else if (ResultCharConv.size() > 1)
+ return std::error_code(E2BIG, std::generic_category());
+ return ResultCharConv[0];
+}
+
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
/// either a character or a string literal.
static unsigned ProcessCharEscape(const char *ThisTokBegin,
@@ -134,7 +145,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
FullSourceLoc Loc, unsigned CharWidth,
DiagnosticsEngine *Diags,
const LangOptions &Features,
- StringLiteralEvalMethod EvalMethod) {
+ StringLiteralEvalMethod EvalMethod,
+ llvm::TextEncodingConverter *Converter) {
const char *EscapeBegin = ThisTokBuf;
bool Delimited = false;
bool EndDelimiterFound = false;
@@ -146,6 +158,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
// that would have been \", which would not have been the end of string.
unsigned ResultChar = *ThisTokBuf++;
char Escape = ResultChar;
+ bool Transcode = true;
+ bool Invalid = false;
switch (ResultChar) {
// These map to themselves.
case '\\': case '\'': case '"': case '?': break;
@@ -186,6 +200,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
ResultChar = 11;
break;
case 'x': { // Hex escape.
+ Transcode = false;
ResultChar = 0;
if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
Delimited = true;
@@ -249,6 +264,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
case '4': case '5': case '6': case '7': {
// Octal escapes.
--ThisTokBuf;
+ Transcode = false;
ResultChar = 0;
// Octal escapes are a series of octal digits with maximum length 3.
@@ -272,6 +288,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
}
case 'o': {
bool Overflow = false;
+ Transcode = false;
if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
HadError = true;
if (Diags)
@@ -334,6 +351,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
<< std::string(1, ResultChar);
break;
default:
+ Invalid = true;
if (!Diags)
break;
@@ -367,6 +385,21 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
HadError = true;
}
+ if (!HadError && EvalMethod != StringLiteralEvalMethod::Unevaluated &&
+ Transcode && Converter) {
+ // Invalid escapes are written as '?' and then translated.
+ assert(ResultChar <= std::numeric_limits<char>::max());
+ char ByteChar = Invalid ? '?' : ResultChar;
+ auto ErrorOrChar = convertCharacter(StringRef(&ByteChar, 1), *Converter);
+ if (ErrorOrChar)
+ ResultChar = *ErrorOrChar;
+ else {
+ Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+ diag::err_exec_charset_conversion_failed)
+ << ErrorOrChar.getError().message();
+ HadError = true;
+ }
+ }
return ResultChar;
}
@@ -1811,6 +1844,11 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
uint32_t *buffer_begin = &codepoint_buffer.front();
uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
+ const TextEncodingConfig &TEC = PP.getTextEncodingConfig();
+ llvm::TextEncodingConverter *Converter = nullptr;
+ if (isOrdinary())
+ Converter = TEC.getConverter(CA_ToExecEncoding);
+
// Unicode escapes representing characters that cannot be correctly
// represented in a single code unit are disallowed in character literals
// by this implementation.
@@ -1825,7 +1863,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
} else if (tok::utf32_char_constant == Kind) {
largest_character_for_kind = 0x10FFFF;
} else {
- largest_character_for_kind = 0x7Fu;
+ largest_character_for_kind = (Converter == nullptr) ? 0x7Fu : 0xFFu;
}
while (begin != end) {
@@ -1865,6 +1903,22 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = true;
PP.Diag(Loc, diag::err_character_too_large);
}
+ if (!HadError && Converter) {
+ assert(isOrdinary() && "Only ordinary characters are supported");
+ std::string UTF8String;
+ convertUTF32ToUTF8String(
+ ArrayRef<char>(reinterpret_cast<const char *>(tmp_out_start),
+ 4),
+ UTF8String);
+ auto ErrorOrChar = convertCharacter(UTF8String, *Converter);
+ if (ErrorOrChar) {
+ *tmp_out_start = *ErrorOrChar;
+ } else {
+ HadError = true;
+ PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
+ << ErrorOrChar.getError().message();
+ }
+ }
}
}
@@ -1872,16 +1926,37 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
}
// Is this a Universal Character Name escape?
if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
- unsigned short UcnLen = 0;
- if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
- FullSourceLoc(Loc, PP.getSourceManager()),
- &PP.getDiagnostics(), PP.getLangOpts(), true)) {
- HadError = true;
- } else if (*buffer_begin > largest_character_for_kind) {
- HadError = true;
- PP.Diag(Loc, diag::err_character_too_large);
+ if (Converter == nullptr) {
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ &PP.getDiagnostics(), PP.getLangOpts(), true)) {
+ HadError = true;
+ } else if (*buffer_begin > largest_character_for_kind) {
+ HadError = true;
+ PP.Diag(Loc, diag::err_character_too_large);
+ }
+ } else {
+ char Cp[5];
+ char *ResultPtr = Cp;
+ EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ /*CharByteWidth=*/1u, &PP.getDiagnostics(),
+ PP.getLangOpts());
+ assert(ResultPtr - Cp <= 4 &&
+ "unexpected result size for UCN escape character");
+ if (!HadError) {
+ auto ErrorOrChar =
+ convertCharacter(StringRef(Cp, ResultPtr - Cp), *Converter);
+ if (ErrorOrChar)
+ *buffer_begin = *ErrorOrChar;
+ else {
+ PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
+ << ErrorOrChar.getError().message();
+ HadError = true;
+ }
+ }
}
-
++buffer_begin;
continue;
}
@@ -1890,7 +1965,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
ProcessCharEscape(TokBegin, begin, end, HadError,
FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
&PP.getDiagnostics(), PP.getLangOpts(),
- StringLiteralEvalMethod::Evaluated);
+ StringLiteralEvalMethod::Evaluated, nullptr);
*buffer_begin++ = result;
}
@@ -2000,16 +2075,18 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
///
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
Preprocessor &PP,
- StringLiteralEvalMethod EvalMethod)
+ StringLiteralEvalMethod EvalMethod,
+ ConversionAction Action)
: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
- MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
- ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
- Pascal(false) {
- init(StringToks);
+ TEC(&PP.getTextEncodingConfig()), MaxTokenLength(0), SizeBound(0),
+ CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
+ EvalMethod(EvalMethod), hadError(false), Pascal(false) {
+ init(StringToks, Action);
}
-void StringLiteralParser::init(ArrayRef<Token> StringToks){
+void StringLiteralParser::init(ArrayRef<Token> StringToks,
+ ConversionAction Action) {
// The literal token may have come from an invalid source location (e.g. due
// to a PCH error), in which case the token length will be 0.
if (StringToks.empty() || StringToks[0].getLength() < 2)
@@ -2101,6 +2178,10 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
SourceLocation UDSuffixTokLoc;
+ llvm::TextEncodingConverter *Converter = nullptr;
+ if (isOrdinary() && TEC)
+ Converter = TEC->getConverter(Action);
+
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
const char *ThisTokBuf = &TokenBuf[0];
// Get the spelling of the token, which eliminates trigraphs, etc. We know
@@ -2211,7 +2292,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
// Copy everything before the \r\n sequence into the string literal.
- if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
+ if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF,
+ Converter))
hadError = true;
// Point into the \n inside the \r\n sequence and operate on the
@@ -2250,24 +2332,32 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
// Copy the character span over.
if (CopyStringFragment(StringToks[i], ThisTokBegin,
- StringRef(InStart, ThisTokBuf - InStart)))
+ StringRef(InStart, ThisTokBuf - InStart),
+ Converter))
hadError = true;
continue;
}
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
ThisTokBuf[1] == 'N') {
- EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
- ResultPtr, hadError,
+ char *Cp = ResultPtr;
+ EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr,
+ hadError,
FullSourceLoc(StringToks[i].getLocation(), SM),
CharByteWidth, Diags, Features);
+ if (!hadError && Converter) {
+ SmallString<8> CpConv;
+ Converter->convert(StringRef(Cp), CpConv);
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ ResultPtr = Cp + CpConv.size();
+ }
continue;
}
// Otherwise, this is a non-UCN escape character. Process it.
- unsigned ResultChar =
- ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
- FullSourceLoc(StringToks[i].getLocation(), SM),
- CharByteWidth * 8, Diags, Features, EvalMethod);
+ unsigned ResultChar = ProcessCharEscape(
+ ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
+ FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth * 8,
+ Diags, Features, EvalMethod, Converter);
if (CharByteWidth == 4) {
// FIXME: Make the type of the result buffer correct instead of
@@ -2343,12 +2433,29 @@ static const char *resyncUTF8(const char *Err, const char *End) {
/// This function copies from Fragment, which is a sequence of bytes
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
/// Performs widening for multi-byte characters.
-bool StringLiteralParser::CopyStringFragment(const Token &Tok,
- const char *TokBegin,
- StringRef Fragment) {
+bool StringLiteralParser::CopyStringFragment(
+ const Token &Tok, const char *TokBegin, StringRef Fragment,
+ llvm::TextEncodingConverter *Converter) {
+
const llvm::UTF8 *ErrorPtrTmp;
- if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
+ if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) {
+ if (Converter) {
+ assert(isOrdinary() && "Only ordinary literals are supported");
+ SmallString<64> CpConv;
+ char *Cp = ResultPtr - Fragment.size();
+ auto EC = Converter->convert(Fragment, CpConv);
+ if (!EC) {
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ ResultPtr = Cp + CpConv.size();
+ } else { // there was a conversion error
+ if (Diags)
+ Diags->Report(Tok.getLocation(),
+ diag::err_exec_charset_conversion_failed)
+ << EC.message();
+ }
+ }
return false;
+ }
// If we see bad encoding for unprefixed string literals, warn and
// simply copy the byte values, for compatibility with gcc and older
@@ -2465,7 +2572,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
} else {
ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
- Diags, Features, StringLiteralEvalMethod::Evaluated);
+ Diags, Features, StringLiteralEvalMethod::Evaluated,
+ /*Converter=*/nullptr);
--ByteNo;
}
assert(!HadError && "This method isn't valid on erroneous strings");
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index 12d3c765b15bc..a20703db01def 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -1650,7 +1650,8 @@ void Preprocessor::HandleLineDirective() {
return;
} else {
// Parse and validate the string, converting it into a unique ID.
- StringLiteralParser Literal(StrTok, *this);
+ StringLiteralParser Literal(
+ StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion);
assert(Literal.isOrdinary() && "Didn't allow wide strings in");
if (Literal.hadError) {
DiscardUntilEndOfDirective();
@@ -1801,7 +1802,8 @@ void Preprocessor::HandleDigitDirective(Token &DigitTok) {
return;
} else {
// Parse and validate the string, converting it into a unique ID.
- StringLiteralParser Literal(StrTok, *this);
+ StringLiteralParser Literal(
+ StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion);
assert(Literal.isOrdinary() && "Didn't allow wide strings in");
if (Literal.hadError) {
DiscardUntilEndOfDirective();
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp
new file mode 100644
index 0000000000000..b89d5baefcc23
--- /dev/null
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -0,0 +1,45 @@
+//===--- TextEncodingConfig.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+llvm::TextEncodingConverter *
+TextEncodingConfig::getConverter(ConversionAction Action) const {
+ switch (Action) {
+ case CA_ToExecEncoding:
+ return ToExecEncodingConverter;
+ default:
+ return nullptr;
+ }
+}
+
+std::error_code
+TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC,
+ const clang::LangOptions &Opts) {
+ using namespace llvm;
+
+ const char *UTF8 = "UTF-8";
+ TEC.ExecEncoding =
+ Opts.ExecEncoding.empty() ? UTF8 : Opts.ExecEncoding.c_str();
+
+ // Create converter between internal and exec encoding specified
+ // in fexec-charset option.
+ if (TEC.ExecEncoding == UTF8)
+ return std::error_code();
+ ErrorOr<TextEncodingConverter> ErrorOrConverter =
+ llvm::TextEncodingConverter::create(UTF8, TEC.ExecEncoding);
+ if (ErrorOrConverter)
+ TEC.ToExecEncodingConverter =
+ new TextEncodingConverter(std::move(*ErrorOrConverter));
+ else
+ return ErrorOrConverter.getError();
+ return std::error_code();
+}
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
new file mode 100644
index 0000000000000..897b9d2eeefa1
--- /dev/null
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | FileCheck %s --check-prefix=CHECK-UTF8
+
+const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+//CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+//CHECK-UTF8: c"ABCDEFGHIJKLMNOPQRSTUVWXYZ\00"
+
+const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";
+//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00"
+//CHECK-UTF8: c"abcdefghijklmnopqrstuvwxyz\00"
+
+const char *Digits = "0123456789";
+//CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+//CHECK-UTF8: c"0123456789\00"
+
+const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@=";
+//CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+//CHECK-UTF8: c" .<(+|&!$*);^-/,%%_>`:#@=\00"
+
+const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00"
+//CHECK-UTF8: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
+
+const char *InvalidEscape = "\y\z";
+//CHECK: c"oo\00"
+//CHECK-UTF8: c"yz\00"
+
+const char *HexCharacters = "\x12\x13\x14";
+//CHECK: c"\12\13\14\00"
+//CHECK-UTF8: c"\12\13\14\00"
+
+const char *OctalCharacters = "\141\142\143";
+//CHECK: c"abc\00"
+//CHECK-UTF8: c"abc\00"
+
+const char singleChar = 'a';
+//CHECK: i8 -127
+//CHECK-UTF8: 97
+
+#ifndef IBM1047_ONLY
+const char cent = '¢';
+//CHECK: i8 74
+
+const char currency = '¤';
+//CHECK: i8 -97
+#endif
+
+const char *UcnCharacters = "\u00E2\u00AC\U000000DF";
+//CHECK: c"B\B0Y\00"
+//CHECK-UTF8: c"\C3\A2\C2\AC\C3\9F\00"
+
+const char *Unicode = "ÿ";
+//CHECK: c"\DF\00"
+//CHECK-UTF8: c"\C3\BF\00"
+
+// RUN: not %clang_cc1 -fexec-charset invalid %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+// CHECK-ERROR: error: failed to set fexec-charset to 'invalid'
+
diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp
new file mode 100644
index 0000000000000..db106f2803677
--- /dev/null
+++ b/clang/test/CodeGen/systemz-charset.cpp
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -std=c++17 -fexec-charset IBM-1047 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -o - | FileCheck %s --check-prefix=CHECK-UTF8
+
+const char *RawString = R"(Hello\n)";
+//CHECK: c"\C8\85\93\93\96\E0\95\00"
+//CHECK-UTF8: c"Hello\\n\00"
+
+const char *MultiLineRawString = R"(
+Hello
+There)";
+//CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00"
+//CHECK-UTF8: c"\0AHello\0AThere\00"
+
+char UnicodeChar8 = u8'1';
+//CHECK: i8 49
+//CHECK-UTF8: i8 49
+char16_t UnicodeChar16 = u'1';
+//CHECK: i16 49
+//CHECK-UTF8: i16 49
+char32_t UnicodeChar32 = U'1';
+//CHECK: i32 49
+//CHECK-UTF8: i32 49
+
+const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
+//CHECK-UTF8: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
+
+const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0]
+//CHECK-UTF8: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0]
+
+const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0]
+//CHECK-UTF8: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0]
+
+const char *UnicodeString8 = u8"Hello";
+//CHECK: c"Hello\00"
+//CHECK-UTF8: c"Hello\00"
+
+const char16_t *UnicodeString16 = u"Hello";
+//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0]
+//CHECK-UTF8: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0]
+
+const char32_t *UnicodeString32 = U"Hello";
+//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0]
+//CHECK-UTF8: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0]
+
+const char *UnicodeRawString8 = u8R"("Hello\")";
+//CHECK: c"\22Hello\\\22\00"
+//CHECK-UTF8: c"\22Hello\\\22\00"
+
+const char16_t *UnicodeRawString16 = uR"("Hello\")";
+//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0]
+//CHECK-UTF8: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0]
+
+const char32_t *UnicodeRawString32 = UR"("Hello\")";
+//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0]
+//CHECK-UTF8: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0]
+
+const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF";
+//CHECK: c"\C3\A2\C2\AC\C3\9F\00"
+//CHECK-UTF8: c"\C3\A2\C2\AC\C3\9F\00"
+
+const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0]
+//CHECK-UTF8: [4 x i16] [i16 226, i16 172, i16 223, i16 0]
+
+const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0]
+//CHECK-UTF8: [4 x i32] [i32 226, i32 172, i32 223, i32 0]
diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c
index a8fbde46cbb75..4414c7d919879 100644
--- a/clang/test/Preprocessor/init-s390x.c
+++ b/clang/test/Preprocessor/init-s390x.c
@@ -206,4 +206,5 @@
// S390X-ZOS: #define __TOS_390__ 1
// S390X-ZOS: #define __TOS_MVS__ 1
// S390X-ZOS: #define __XPLINK__ 1
+// S390X-ZOS: #define __clang_literal_encoding__ "IBM-1047"
// S390X-ZOS-GNUXX: #define __wchar_t 1
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 7b24db121818f..c04fef520afa4 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -528,6 +528,10 @@ class Triple {
/// For example, "fooos1.2.3" would return "1.2.3".
LLVM_ABI StringRef getEnvironmentVersionString() const;
+ /// Get the default system encoding of the triple.
+ /// For example, "IBM-1047" for z/OS, "UTF-8" for others
+ LLVM_ABI StringRef getDefaultNarrowTextEncoding() const;
+
/// @}
/// @name Convenience Predicates
/// @{
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index c6515425b7eb5..1f1812c9f4096 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1678,6 +1678,13 @@ StringRef Triple::getOSAndEnvironmentName() const {
return Tmp.split('-').second; // Strip second component
}
+// Default encoding on z/OS is IBM-1047 and UTF-8 otherwise
+StringRef Triple::getDefaultNarrowTextEncoding() const {
+ if (getOS() == llvm::Triple::ZOS)
+ return "IBM-1047";
+ return "UTF-8";
+}
+
static VersionTuple parseVersionFromName(StringRef Name) {
VersionTuple Version;
Version.tryParse(Name);
>From 6c16d9e3f8afb85dfd63b6dcda3cf16f1ba6075d Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 11 May 2026 14:45:38 -0400
Subject: [PATCH 2/7] move conversion into EncodeUCNEscape, update testcase
---
clang/lib/Lex/LiteralSupport.cpp | 50 ++++++++++++--------------
clang/test/CodeGen/systemz-charset.cpp | 3 ++
2 files changed, 26 insertions(+), 27 deletions(-)
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 9b8835bbf5e35..59ece0dbf79ed 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -784,11 +784,11 @@ static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
/// we will likely rework our support for UCN's.
static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
- const char *ThisTokEnd,
- char *&ResultBuf, bool &HadError,
- FullSourceLoc Loc, unsigned CharByteWidth,
- DiagnosticsEngine *Diags,
- const LangOptions &Features) {
+ const char *ThisTokEnd, char *&ResultBuf,
+ bool &HadError, FullSourceLoc Loc,
+ unsigned CharByteWidth, DiagnosticsEngine *Diags,
+ const LangOptions &Features,
+ llvm::TextEncodingConverter *Converter) {
typedef uint32_t UTF32;
UTF32 UcnVal = 0;
unsigned short UcnLen = 0;
@@ -875,6 +875,20 @@ static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
}
// Update the buffer.
ResultBuf += bytesToWrite;
+
+ if (Converter) {
+ SmallString<4> CpConv;
+ char *Cp = ResultBuf - bytesToWrite;
+ auto EC = Converter->convert(StringRef(Cp, bytesToWrite), CpConv);
+ if (!EC) {
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ ResultBuf = Cp + CpConv.size();
+ } else {
+ Diags->Report(Loc, diag::err_exec_charset_conversion_failed)
+ << EC.message();
+ HadError = true;
+ }
+ }
}
/// integer-constant: [C99 6.4.4.1]
@@ -1942,20 +1956,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError,
FullSourceLoc(Loc, PP.getSourceManager()),
/*CharByteWidth=*/1u, &PP.getDiagnostics(),
- PP.getLangOpts());
- assert(ResultPtr - Cp <= 4 &&
- "unexpected result size for UCN escape character");
- if (!HadError) {
- auto ErrorOrChar =
- convertCharacter(StringRef(Cp, ResultPtr - Cp), *Converter);
- if (ErrorOrChar)
- *buffer_begin = *ErrorOrChar;
- else {
- PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
- << ErrorOrChar.getError().message();
- HadError = true;
- }
- }
+ PP.getLangOpts(), Converter);
+ if (!HadError)
+ *buffer_begin = *Cp;
}
++buffer_begin;
continue;
@@ -2340,17 +2343,10 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
ThisTokBuf[1] == 'N') {
- char *Cp = ResultPtr;
EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr,
hadError,
FullSourceLoc(StringToks[i].getLocation(), SM),
- CharByteWidth, Diags, Features);
- if (!hadError && Converter) {
- SmallString<8> CpConv;
- Converter->convert(StringRef(Cp), CpConv);
- memcpy(Cp, CpConv.data(), CpConv.size());
- ResultPtr = Cp + CpConv.size();
- }
+ CharByteWidth, Diags, Features, Converter);
continue;
}
// Otherwise, this is a non-UCN escape character. Process it.
diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp
index db106f2803677..396b82909fc9b 100644
--- a/clang/test/CodeGen/systemz-charset.cpp
+++ b/clang/test/CodeGen/systemz-charset.cpp
@@ -21,6 +21,9 @@ char32_t UnicodeChar32 = U'1';
//CHECK: i32 49
//CHECK-UTF8: i32 49
+int FourChar = '1234';
+//CHECK: i32 -235736076
+
const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?";
//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
//CHECK-UTF8: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
>From a90fb877aee0483e1f61f486cf8c5a01e8bca713 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 22 May 2026 13:27:16 -0400
Subject: [PATCH 3/7] Move function to TargetInfo, remove unused includes
---
clang/include/clang/Basic/TargetInfo.h | 7 +++++++
clang/include/clang/Lex/TextEncodingConfig.h | 3 ---
clang/lib/Frontend/InitPreprocessor.cpp | 5 ++---
llvm/include/llvm/TargetParser/Triple.h | 4 ----
llvm/lib/TargetParser/Triple.cpp | 7 -------
5 files changed, 9 insertions(+), 17 deletions(-)
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index cc226403877e2..779093f963bc1 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1590,6 +1590,13 @@ class TargetInfo : public TransferrableTargetInfo,
getTriple().isOSFreeBSD());
}
+ // Default encoding on z/OS is IBM-1047 and UTF-8 otherwise
+ StringRef getDefaultNarrowTextEncoding() const {
+ if (getTriple().getOS() == llvm::Triple::ZOS)
+ return "IBM-1047";
+ return "UTF-8";
+ }
+
// Identify whether this target supports __builtin_cpu_supports and
// __builtin_cpu_is.
virtual bool supportsCpuSupports() const { return false; }
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h
index 09967a81beeed..9e1e7649d65c4 100644
--- a/clang/include/clang/Lex/TextEncodingConfig.h
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -9,10 +9,7 @@
#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
-#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/TargetInfo.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/TextEncoding.h"
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 200eab9b971a7..fbb3be1ab0ac7 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1040,9 +1040,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
Builder.defineMacro("__clang_literal_encoding__",
Twine("\"" + LangOpts.ExecEncoding + "\""));
else
- Builder.defineMacro(
- "__clang_literal_encoding__",
- Twine("\"" + TI.getTriple().getDefaultNarrowTextEncoding() + "\""));
+ Builder.defineMacro("__clang_literal_encoding__",
+ Twine("\"" + TI.getDefaultNarrowTextEncoding() + "\""));
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index c04fef520afa4..7b24db121818f 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -528,10 +528,6 @@ class Triple {
/// For example, "fooos1.2.3" would return "1.2.3".
LLVM_ABI StringRef getEnvironmentVersionString() const;
- /// Get the default system encoding of the triple.
- /// For example, "IBM-1047" for z/OS, "UTF-8" for others
- LLVM_ABI StringRef getDefaultNarrowTextEncoding() const;
-
/// @}
/// @name Convenience Predicates
/// @{
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 1f1812c9f4096..c6515425b7eb5 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1678,13 +1678,6 @@ StringRef Triple::getOSAndEnvironmentName() const {
return Tmp.split('-').second; // Strip second component
}
-// Default encoding on z/OS is IBM-1047 and UTF-8 otherwise
-StringRef Triple::getDefaultNarrowTextEncoding() const {
- if (getOS() == llvm::Triple::ZOS)
- return "IBM-1047";
- return "UTF-8";
-}
-
static VersionTuple parseVersionFromName(StringRef Name) {
VersionTuple Version;
Version.tryParse(Name);
>From 1fd56715f49539813306a3a775d6af556c9d9bad Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 28 May 2026 15:23:33 -0400
Subject: [PATCH 4/7] Add test coverage for conversion errors in string
literals
---
clang/test/CodeGen/systemz-charset-diag.cpp | 3 +++
1 file changed, 3 insertions(+)
create mode 100644 clang/test/CodeGen/systemz-charset-diag.cpp
diff --git a/clang/test/CodeGen/systemz-charset-diag.cpp b/clang/test/CodeGen/systemz-charset-diag.cpp
new file mode 100644
index 0000000000000..5b398b4b58af6
--- /dev/null
+++ b/clang/test/CodeGen/systemz-charset-diag.cpp
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 -triple s390x-none-zos -fexec-charset IBM-1047 %s -std=c++17 -emit-llvm -o - -verify
+
+const char* Computer = "🖥️"; // expected-error-re {{conversion to execution encoding failed: {{.*}}}}
>From 4f3ca3cc8fa30024b47b485b68012302eb18c85a Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 3 Jun 2026 09:37:11 -0400
Subject: [PATCH 5/7] rename TextEncodingConfig to TextEncoding, address other
comments
---
clang/include/clang/Basic/TargetInfo.h | 2 +-
clang/include/clang/Lex/LiteralSupport.h | 6 +++---
clang/include/clang/Lex/Preprocessor.h | 6 +++---
.../Lex/{TextEncodingConfig.h => TextEncoding.h} | 11 +++++------
clang/include/clang/Options/Options.td | 2 +-
clang/lib/Frontend/CompilerInstance.cpp | 8 ++++----
clang/lib/Frontend/InitPreprocessor.cpp | 12 ++++++------
clang/lib/Lex/CMakeLists.txt | 2 +-
clang/lib/Lex/LiteralSupport.cpp | 12 ++++++------
clang/lib/Lex/PPDirectives.cpp | 6 ++----
.../Lex/{TextEncodingConfig.cpp => TextEncoding.cpp} | 12 +++++-------
11 files changed, 37 insertions(+), 42 deletions(-)
rename clang/include/clang/Lex/{TextEncodingConfig.h => TextEncoding.h} (70%)
rename clang/lib/Lex/{TextEncodingConfig.cpp => TextEncoding.cpp} (75%)
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 779093f963bc1..a4984cffc430a 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1591,7 +1591,7 @@ class TargetInfo : public TransferrableTargetInfo,
}
// Default encoding on z/OS is IBM-1047 and UTF-8 otherwise
- StringRef getDefaultNarrowTextEncoding() const {
+ StringRef getDefaultOrdinaryTextEncoding() const {
if (getTriple().getOS() == llvm::Triple::ZOS)
return "IBM-1047";
return "UTF-8";
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index 6b404403ed95f..cb56cd391b22f 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -17,7 +17,7 @@
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
-#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Lex/TextEncoding.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
@@ -235,7 +235,7 @@ class StringLiteralParser {
const LangOptions &Features;
const TargetInfo &Target;
DiagnosticsEngine *Diags;
- TextEncodingConfig *TEC;
+ TextEncoding *TE;
unsigned MaxTokenLength;
unsigned SizeBound;
@@ -256,7 +256,7 @@ class StringLiteralParser {
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
- : SM(sm), Features(features), Target(target), Diags(diags), TEC(nullptr),
+ : SM(sm), Features(features), Target(target), Diags(diags), TE(nullptr),
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
ResultPtr(ResultBuf.data()),
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 27fc7ef8d68dc..78d9778bb6001 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -30,7 +30,7 @@
#include "clang/Lex/ModuleMap.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/PPEmbedParameters.h"
-#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Lex/TextEncoding.h"
#include "clang/Lex/Token.h"
#include "clang/Lex/TokenLexer.h"
#include "clang/Support/Compiler.h"
@@ -199,7 +199,7 @@ class Preprocessor {
std::unique_ptr<ScratchBuffer> ScratchBuf;
HeaderSearch &HeaderInfo;
ModuleLoader &TheModuleLoader;
- TextEncodingConfig TEC;
+ TextEncoding TE;
/// External source of macros.
ExternalPreprocessorSource *ExternalSource;
@@ -1266,7 +1266,7 @@ class Preprocessor {
SelectorTable &getSelectorTable() { return Selectors; }
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
- TextEncodingConfig &getTextEncodingConfig() { return TEC; }
+ TextEncoding &getTextEncoding() { return TE; }
void setExternalSource(ExternalPreprocessorSource *Source) {
ExternalSource = Source;
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncoding.h
similarity index 70%
rename from clang/include/clang/Lex/TextEncodingConfig.h
rename to clang/include/clang/Lex/TextEncoding.h
index 9e1e7649d65c4..770cb3c5eff1a 100644
--- a/clang/include/clang/Lex/TextEncodingConfig.h
+++ b/clang/include/clang/Lex/TextEncoding.h
@@ -1,4 +1,4 @@
-//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ -*-===//
+//===-- clang/Lex/TextEncoding.h - Text Encoding Conversion ------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
-#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+#ifndef LLVM_CLANG_LEX_TEXTENCODING_H
+#define LLVM_CLANG_LEX_TEXTENCODING_H
#include "clang/Basic/LangOptions.h"
#include "llvm/ADT/StringRef.h"
@@ -15,15 +15,14 @@
enum ConversionAction { CA_NoConversion, CA_ToExecEncoding };
-class TextEncodingConfig {
+class TextEncoding {
llvm::StringRef ExecEncoding;
llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;
public:
llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
static std::error_code
- setConvertersFromOptions(TextEncodingConfig &TEC,
- const clang::LangOptions &Opts);
+ setConvertersFromOptions(TextEncoding &TE, const clang::LangOptions &Opts);
llvm::StringRef getExecEncoding() { return ExecEncoding; }
};
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index ee7937f38f1a3..257f95bb88a42 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -7522,7 +7522,7 @@ def tune_cpu : Separate<["-"], "tune-cpu">,
HelpText<"Tune for a specific cpu type">,
MarshallingInfoString<TargetOpts<"TuneCPU">>;
def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<encoding>">,
- HelpText<"Set the execution <encoding> for string and character literals. "
+ HelpText<"Set the execution <encoding> for ordinary string and character literals. "
"Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
"and possibly those supported by ICU or the host iconv library.">,
MarshallingInfoString<LangOpts<"ExecEncoding">>;
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index dff396d91f2f1..952eb73c210ff 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -34,7 +34,7 @@
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
-#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Lex/TextEncoding.h"
#include "clang/Sema/CodeCompleteConsumer.h"
#include "clang/Sema/ParsedAttr.h"
#include "clang/Sema/Sema.h"
@@ -555,10 +555,10 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
- if (auto EC = TextEncodingConfig::setConvertersFromOptions(
- PP->getTextEncodingConfig(), getLangOpts()))
+ if (auto EC = TextEncoding::setConvertersFromOptions(PP->getTextEncoding(),
+ getLangOpts()))
PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config)
- << PP->getTextEncodingConfig().getExecEncoding();
+ << PP->getTextEncoding().getExecEncoding();
}
// ASTContext
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index fbb3be1ab0ac7..15c62e39d9506 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1036,12 +1036,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
// Macros to help identify the narrow and wide character sets. This is set
// to fexec-charset. If fexec-charset is not specified, the default is the
// system charset.
- if (!LangOpts.ExecEncoding.empty())
- Builder.defineMacro("__clang_literal_encoding__",
- Twine("\"" + LangOpts.ExecEncoding + "\""));
- else
- Builder.defineMacro("__clang_literal_encoding__",
- Twine("\"" + TI.getDefaultNarrowTextEncoding() + "\""));
+ Builder.defineMacro(
+ "__clang_literal_encoding__",
+ Twine("\"" +
+ (LangOpts.ExecEncoding.empty() ? TI.getDefaultOrdinaryTextEncoding()
+ : LangOpts.ExecEncoding) +
+ "\""));
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt
index 106a5d3b126be..7b0be7249cd99 100644
--- a/clang/lib/Lex/CMakeLists.txt
+++ b/clang/lib/Lex/CMakeLists.txt
@@ -29,7 +29,7 @@ add_clang_library(clangLex
Preprocessor.cpp
PreprocessorLexer.cpp
ScratchBuffer.cpp
- TextEncodingConfig.cpp
+ TextEncoding.cpp
TokenConcatenation.cpp
TokenLexer.cpp
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 59ece0dbf79ed..c3c1ad8d78ab8 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1858,10 +1858,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
uint32_t *buffer_begin = &codepoint_buffer.front();
uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
- const TextEncodingConfig &TEC = PP.getTextEncodingConfig();
+ const TextEncoding &TE = PP.getTextEncoding();
llvm::TextEncodingConverter *Converter = nullptr;
if (isOrdinary())
- Converter = TEC.getConverter(CA_ToExecEncoding);
+ Converter = TE.getConverter(CA_ToExecEncoding);
// Unicode escapes representing characters that cannot be correctly
// represented in a single code unit are disallowed in character literals
@@ -2082,7 +2082,7 @@ StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
ConversionAction Action)
: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
- TEC(&PP.getTextEncodingConfig()), MaxTokenLength(0), SizeBound(0),
+ TE(&PP.getTextEncoding()), MaxTokenLength(0), SizeBound(0),
CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
EvalMethod(EvalMethod), hadError(false), Pascal(false) {
init(StringToks, Action);
@@ -2182,8 +2182,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
SourceLocation UDSuffixTokLoc;
llvm::TextEncodingConverter *Converter = nullptr;
- if (isOrdinary() && TEC)
- Converter = TEC->getConverter(Action);
+ if (isOrdinary() && TE)
+ Converter = TE->getConverter(Action);
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
const char *ThisTokBuf = &TokenBuf[0];
@@ -2569,7 +2569,7 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
Diags, Features, StringLiteralEvalMethod::Evaluated,
- /*TextEncodingConfig=*/nullptr);
+ /*TextEncoding=*/nullptr);
--ByteNo;
}
assert(!HadError && "This method isn't valid on erroneous strings");
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index a20703db01def..12d3c765b15bc 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -1650,8 +1650,7 @@ void Preprocessor::HandleLineDirective() {
return;
} else {
// Parse and validate the string, converting it into a unique ID.
- StringLiteralParser Literal(
- StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion);
+ StringLiteralParser Literal(StrTok, *this);
assert(Literal.isOrdinary() && "Didn't allow wide strings in");
if (Literal.hadError) {
DiscardUntilEndOfDirective();
@@ -1802,8 +1801,7 @@ void Preprocessor::HandleDigitDirective(Token &DigitTok) {
return;
} else {
// Parse and validate the string, converting it into a unique ID.
- StringLiteralParser Literal(
- StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion);
+ StringLiteralParser Literal(StrTok, *this);
assert(Literal.isOrdinary() && "Didn't allow wide strings in");
if (Literal.hadError) {
DiscardUntilEndOfDirective();
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncoding.cpp
similarity index 75%
rename from clang/lib/Lex/TextEncodingConfig.cpp
rename to clang/lib/Lex/TextEncoding.cpp
index b89d5baefcc23..99bf4c4f2d1c7 100644
--- a/clang/lib/Lex/TextEncodingConfig.cpp
+++ b/clang/lib/Lex/TextEncoding.cpp
@@ -1,4 +1,4 @@
-//===--- TextEncodingConfig.cpp -------------------------------------------===//
+//===--- TextEncoding.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,11 @@
//
//===----------------------------------------------------------------------===//
-#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Lex/TextEncoding.h"
#include "clang/Basic/DiagnosticDriver.h"
-using namespace llvm;
-
llvm::TextEncodingConverter *
-TextEncodingConfig::getConverter(ConversionAction Action) const {
+TextEncoding::getConverter(ConversionAction Action) const {
switch (Action) {
case CA_ToExecEncoding:
return ToExecEncodingConverter;
@@ -22,8 +20,8 @@ TextEncodingConfig::getConverter(ConversionAction Action) const {
}
std::error_code
-TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC,
- const clang::LangOptions &Opts) {
+TextEncoding::setConvertersFromOptions(TextEncoding &TEC,
+ const clang::LangOptions &Opts) {
using namespace llvm;
const char *UTF8 = "UTF-8";
>From 7cc6be3bf026aeafd5f97c11807e542299f6c47c Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 3 Jun 2026 11:36:04 -0400
Subject: [PATCH 6/7] change default of StringLiteralParser to NoConversion
instead of CA_ToExecEncoding
---
clang/include/clang/Lex/LiteralSupport.h | 2 +-
clang/lib/Lex/TextEncoding.cpp | 2 +-
clang/lib/Sema/SemaExpr.cpp | 3 ++-
3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index cb56cd391b22f..9270190e02480 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -252,7 +252,7 @@ class StringLiteralParser {
StringLiteralParser(
ArrayRef<Token> StringToks, Preprocessor &PP,
StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
- ConversionAction Action = CA_ToExecEncoding);
+ ConversionAction Action = CA_NoConversion);
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp
index 99bf4c4f2d1c7..ba878800564f0 100644
--- a/clang/lib/Lex/TextEncoding.cpp
+++ b/clang/lib/Lex/TextEncoding.cpp
@@ -1,4 +1,4 @@
-//===--- TextEncoding.cpp -------------------------------------------===//
+//===--- TextEncoding.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 521a8516ac179..919c950d61324 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2240,7 +2240,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
if (getLangOpts().MicrosoftExt)
StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks);
- StringLiteralParser Literal(StringToks, PP);
+ StringLiteralParser Literal(
+ StringToks, PP, StringLiteralEvalMethod::Evaluated, CA_ToExecEncoding);
if (Literal.hadError)
return ExprError();
>From 7d18fbd4cf631dc7f58e9e7cd405b39942315da2 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 8 Jun 2026 14:47:45 -0400
Subject: [PATCH 7/7] remove extra conversion from utf32 to utf8 in
CharLiteralParser
---
clang/lib/Lex/LiteralSupport.cpp | 35 ++++++++++++++------------
clang/test/CodeGen/systemz-charset.cpp | 1 +
2 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index c3c1ad8d78ab8..70070e8bb1f2a 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1890,6 +1890,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
char const *tmp_in_start = start;
uint32_t *tmp_out_start = buffer_begin;
+ std::string UTF8String(start, begin);
llvm::ConversionResult res =
llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
reinterpret_cast<llvm::UTF8 const *>(begin),
@@ -1912,26 +1913,28 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = true;
}
} else {
- for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
- if (*tmp_out_start > largest_character_for_kind) {
+ uint32_t *validation_ptr = tmp_out_start;
+ for (; validation_ptr < buffer_begin; ++validation_ptr) {
+ if (*validation_ptr > largest_character_for_kind) {
HadError = true;
PP.Diag(Loc, diag::err_character_too_large);
}
- if (!HadError && Converter) {
- assert(isOrdinary() && "Only ordinary characters are supported");
- std::string UTF8String;
- convertUTF32ToUTF8String(
- ArrayRef<char>(reinterpret_cast<const char *>(tmp_out_start),
- 4),
- UTF8String);
- auto ErrorOrChar = convertCharacter(UTF8String, *Converter);
- if (ErrorOrChar) {
- *tmp_out_start = *ErrorOrChar;
- } else {
- HadError = true;
- PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
- << ErrorOrChar.getError().message();
+ }
+
+ // Convert to execution character set if needed
+ if (!HadError && Converter) {
+ assert(isOrdinary() && "Only ordinary characters are supported");
+ SmallString<4> Converted;
+ auto ErrorOrChar = Converter->convert(UTF8String, Converted);
+ if (!ErrorOrChar) {
+ for (int i = 0; tmp_out_start < buffer_begin;
+ ++tmp_out_start, ++i) {
+ *tmp_out_start = Converted[i];
}
+ } else {
+ HadError = true;
+ PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
+ << ErrorOrChar.message();
}
}
}
diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp
index 396b82909fc9b..59c4ad550cd94 100644
--- a/clang/test/CodeGen/systemz-charset.cpp
+++ b/clang/test/CodeGen/systemz-charset.cpp
@@ -23,6 +23,7 @@ char32_t UnicodeChar32 = U'1';
int FourChar = '1234';
//CHECK: i32 -235736076
+//CHECK-UTF8: i32 825373492
const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?";
//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
More information about the cfe-commits
mailing list