[clang] [llvm] Enable fexec-charset option (PR #138895)
Abhina Sree via cfe-commits
cfe-commits at lists.llvm.org
Fri May 8 09:23:06 PDT 2026
https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/138895
>From 586094419b9b2e1aa493b5a47af1a510e55bbf54 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 8 May 2026 12:16:13 -0400
Subject: [PATCH] This patch enables the fexec-charset option to control the
execution charset of string literals. It sets the default internal charset,
system charset, and execution charset for z/OS and UTF-8 for all other
platforms.
---
.../clang/Basic/DiagnosticFrontendKinds.td | 2 +-
.../include/clang/Basic/DiagnosticLexKinds.td | 2 +
clang/include/clang/Basic/LangOptions.h | 3 +
clang/include/clang/Lex/LiteralSupport.h | 19 +-
clang/include/clang/Lex/Preprocessor.h | 3 +
clang/include/clang/Lex/TextEncodingConfig.h | 34 ++++
clang/include/clang/Options/Options.td | 5 +
clang/lib/Frontend/CompilerInstance.cpp | 6 +
clang/lib/Frontend/FrontendAction.cpp | 4 +-
clang/lib/Frontend/InitPreprocessor.cpp | 15 +-
clang/lib/Lex/CMakeLists.txt | 1 +
clang/lib/Lex/LiteralSupport.cpp | 170 ++++++++++++++----
clang/lib/Lex/PPDirectives.cpp | 6 +-
clang/lib/Lex/TextEncodingConfig.cpp | 45 +++++
clang/test/CodeGen/systemz-charset.c | 58 ++++++
clang/test/CodeGen/systemz-charset.cpp | 46 +++++
clang/test/Preprocessor/init-s390x.c | 1 +
llvm/include/llvm/TargetParser/Triple.h | 4 +
llvm/lib/TargetParser/Triple.cpp | 7 +
19 files changed, 385 insertions(+), 46 deletions(-)
create mode 100644 clang/include/clang/Lex/TextEncodingConfig.h
create mode 100644 clang/lib/Lex/TextEncodingConfig.cpp
create mode 100644 clang/test/CodeGen/systemz-charset.c
create mode 100644 clang/test/CodeGen/systemz-charset.cpp
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index f384a97b6825e..61f96759862c4 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -359,7 +359,7 @@ def err_non_default_visibility_dllimport : Error<
"non-default visibility cannot be applied to 'dllimport' declaration">;
def err_ifunc_resolver_return : Error<
"ifunc resolver function must return a pointer">;
-
+def err_fe_text_encoding_config : Error<"failed to set fexec-charset to '%0'">;
def warn_atomic_op_misaligned : Warning<
"misaligned atomic operation may incur "
"significant performance penalty"
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 85fa290de6fd9..f1ebbb40ceb4d 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -287,6 +287,8 @@ def ext_string_too_long : Extension<"string literal of length %0 exceeds "
"support">, InGroup<OverlengthStrings>;
def err_character_too_large : Error<
"character too large for enclosing character literal type">;
+def err_exec_charset_conversion_failed
+ : Error<"conversion to execution encoding failed: '%0'">;
def warn_c99_compat_unicode_literal : Warning<
"unicode literals are incompatible with C99">,
InGroup<C99Compat>, DefaultIgnore;
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 64b12b6fd72c7..1501bc0e38218 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -618,6 +618,9 @@ class LangOptions : public LangOptionsBase {
/// The allocation token mode.
std::optional<llvm::AllocTokenMode> AllocTokenMode;
+ /// Name of the execution encoding to convert the internal encoding to.
+ std::string ExecEncoding;
+
LangOptions();
/// Set language defaults for the given input language and
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index ea5f63bc20399..6b404403ed95f 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -17,11 +17,13 @@
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/TextEncoding.h"
namespace clang {
@@ -233,6 +235,7 @@ class StringLiteralParser {
const LangOptions &Features;
const TargetInfo &Target;
DiagnosticsEngine *Diags;
+ TextEncodingConfig *TEC;
unsigned MaxTokenLength;
unsigned SizeBound;
@@ -246,18 +249,19 @@ class StringLiteralParser {
StringLiteralEvalMethod EvalMethod;
public:
- StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
- StringLiteralEvalMethod StringMethod =
- StringLiteralEvalMethod::Evaluated);
+ StringLiteralParser(
+ ArrayRef<Token> StringToks, Preprocessor &PP,
+ StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
+ ConversionAction Action = CA_ToExecEncoding);
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
- : SM(sm), Features(features), Target(target), Diags(diags),
+ : SM(sm), Features(features), Target(target), Diags(diags), TEC(nullptr),
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
ResultPtr(ResultBuf.data()),
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
Pascal(false) {
- init(StringToks);
+ init(StringToks, CA_NoConversion);
}
bool hadError;
@@ -305,9 +309,10 @@ class StringLiteralParser {
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);
private:
- void init(ArrayRef<Token> StringToks);
+ void init(ArrayRef<Token> StringToks, ConversionAction Action);
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
- StringRef Fragment);
+ StringRef Fragment,
+ llvm::TextEncodingConverter *Converter);
void DiagnoseLexingError(SourceLocation Loc);
};
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 8cba21539e48a..62cbe2dc5ce57 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -30,6 +30,7 @@
#include "clang/Lex/ModuleMap.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/PPEmbedParameters.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "clang/Lex/Token.h"
#include "clang/Lex/TokenLexer.h"
#include "clang/Support/Compiler.h"
@@ -198,6 +199,7 @@ class Preprocessor {
std::unique_ptr<ScratchBuffer> ScratchBuf;
HeaderSearch &HeaderInfo;
ModuleLoader &TheModuleLoader;
+ TextEncodingConfig TEC;
/// External source of macros.
ExternalPreprocessorSource *ExternalSource;
@@ -1269,6 +1271,7 @@ class Preprocessor {
SelectorTable &getSelectorTable() { return Selectors; }
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
+ TextEncodingConfig &getTextEncodingConfig() { return TEC; }
void setExternalSource(ExternalPreprocessorSource *Source) {
ExternalSource = Source;
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h
new file mode 100644
index 0000000000000..09967a81beeed
--- /dev/null
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -0,0 +1,34 @@
+//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/TextEncoding.h"
+
+/// Selects which conversion, if any, is applied to the contents of a literal.
+enum ConversionAction { CA_NoConversion, CA_ToExecEncoding };
+
+/// Holds the name of the execution encoding together with the converter used
+/// to translate literal contents into that encoding.
+class TextEncodingConfig {
+  /// Non-owning view of the execution encoding name; whoever assigns it must
+  /// keep the underlying storage alive for as long as this object is used.
+  llvm::StringRef ExecEncoding;
+  /// Heap-allocated converter to the execution encoding, owned by this
+  /// object. Null when no converter has been installed.
+  llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;
+
+public:
+  TextEncodingConfig() = default;
+  // The converter is uniquely owned; a member-wise copy would free it twice.
+  TextEncodingConfig(const TextEncodingConfig &) = delete;
+  TextEncodingConfig &operator=(const TextEncodingConfig &) = delete;
+  ~TextEncodingConfig() { delete ToExecEncodingConverter; }
+
+  /// Returns the converter to use for \p Action, or nullptr when that action
+  /// requires no conversion or no converter is installed.
+  llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
+
+  /// Configures \p TEC from the execution encoding named in \p Opts.
+  /// \returns a default-constructed error code on success, otherwise the
+  /// error reported while creating the converter.
+  static std::error_code
+  setConvertersFromOptions(TextEncodingConfig &TEC,
+                           const clang::LangOptions &Opts);
+
+  /// Returns the name of the configured execution encoding.
+  llvm::StringRef getExecEncoding() const { return ExecEncoding; }
+};
+
+#endif
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 5eeabf4c33b76..73bce00b921ea 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -7504,6 +7504,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in {
def tune_cpu : Separate<["-"], "tune-cpu">,
HelpText<"Tune for a specific cpu type">,
MarshallingInfoString<TargetOpts<"TuneCPU">>;
+def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<encoding>">,
+ HelpText<"Set the execution <encoding> for string and character literals. "
+ "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
+ "and possibly those supported by ICU or the host iconv library.">,
+ MarshallingInfoString<LangOpts<"ExecEncoding">>;
def target_cpu : Separate<["-"], "target-cpu">,
HelpText<"Target a specific cpu type">,
MarshallingInfoString<TargetOpts<"CPU">>;
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 91eda7392784f..c9b5342b7e8d9 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -34,6 +34,7 @@
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "clang/Sema/CodeCompleteConsumer.h"
#include "clang/Sema/ParsedAttr.h"
#include "clang/Sema/Sema.h"
@@ -547,6 +548,11 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
+
+ if (auto EC = TextEncodingConfig::setConvertersFromOptions(
+ PP->getTextEncodingConfig(), getLangOpts()))
+ PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config)
+ << PP->getTextEncodingConfig().getExecEncoding();
}
// ASTContext
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 3bb1375fc5b77..47eb6ca1b87e6 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -525,7 +525,9 @@ static SourceLocation ReadOriginalFileName(CompilerInstance &CI,
if (T.isAtStartOfLine() || T.getKind() != tok::string_literal)
return SourceLocation();
- StringLiteralParser Literal(T, CI.getPreprocessor());
+ StringLiteralParser Literal(T, CI.getPreprocessor(),
+ StringLiteralEvalMethod::Evaluated,
+ CA_NoConversion);
if (Literal.hadError)
return SourceLocation();
RawLexer->LexFromRawLexer(T);
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 3f0468a938149..200eab9b971a7 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1033,10 +1033,17 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
}
}
- // Macros to help identify the narrow and wide character sets
- // FIXME: clang currently ignores -fexec-charset=. If this changes,
- // then this may need to be updated.
- Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\"");
+ // Macros to help identify the narrow and wide character sets. This is set
+ // to fexec-charset. If fexec-charset is not specified, the default is the
+ // system charset.
+ if (!LangOpts.ExecEncoding.empty())
+ Builder.defineMacro("__clang_literal_encoding__",
+ Twine("\"" + LangOpts.ExecEncoding + "\""));
+ else
+ Builder.defineMacro(
+ "__clang_literal_encoding__",
+ Twine("\"" + TI.getTriple().getDefaultNarrowTextEncoding() + "\""));
+
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
// if -fwide-exec-charset= is ever supported.
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt
index f61737cd68021..106a5d3b126be 100644
--- a/clang/lib/Lex/CMakeLists.txt
+++ b/clang/lib/Lex/CMakeLists.txt
@@ -29,6 +29,7 @@ add_clang_library(clangLex
Preprocessor.cpp
PreprocessorLexer.cpp
ScratchBuffer.cpp
+ TextEncodingConfig.cpp
TokenConcatenation.cpp
TokenLexer.cpp
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 482146ccf8654..9b8835bbf5e35 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -126,6 +126,17 @@ static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
return false;
}
+/// Converts \p Char with \p Converter and returns the resulting code unit.
+///
+/// \returns the converted byte, the converter's own error if the conversion
+/// fails, E2BIG if the result needs more than one byte, or EILSEQ if the
+/// converter produced no output at all.
+static llvm::ErrorOr<char>
+convertCharacter(StringRef Char, const llvm::TextEncodingConverter &Converter) {
+  SmallString<8> ResultCharConv;
+  if (std::error_code EC = Converter.convert(Char, ResultCharConv))
+    return EC;
+  if (ResultCharConv.size() > 1)
+    return std::error_code(E2BIG, std::generic_category());
+  // An empty result has no element 0; report it rather than reading past the
+  // end of the buffer.
+  if (ResultCharConv.empty())
+    return std::error_code(EILSEQ, std::generic_category());
+  return ResultCharConv[0];
+}
+
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
/// either a character or a string literal.
static unsigned ProcessCharEscape(const char *ThisTokBegin,
@@ -134,7 +145,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
FullSourceLoc Loc, unsigned CharWidth,
DiagnosticsEngine *Diags,
const LangOptions &Features,
- StringLiteralEvalMethod EvalMethod) {
+ StringLiteralEvalMethod EvalMethod,
+ llvm::TextEncodingConverter *Converter) {
const char *EscapeBegin = ThisTokBuf;
bool Delimited = false;
bool EndDelimiterFound = false;
@@ -146,6 +158,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
// that would have been \", which would not have been the end of string.
unsigned ResultChar = *ThisTokBuf++;
char Escape = ResultChar;
+ bool Transcode = true;
+ bool Invalid = false;
switch (ResultChar) {
// These map to themselves.
case '\\': case '\'': case '"': case '?': break;
@@ -186,6 +200,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
ResultChar = 11;
break;
case 'x': { // Hex escape.
+ Transcode = false;
ResultChar = 0;
if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
Delimited = true;
@@ -249,6 +264,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
case '4': case '5': case '6': case '7': {
// Octal escapes.
--ThisTokBuf;
+ Transcode = false;
ResultChar = 0;
// Octal escapes are a series of octal digits with maximum length 3.
@@ -272,6 +288,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
}
case 'o': {
bool Overflow = false;
+ Transcode = false;
if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
HadError = true;
if (Diags)
@@ -334,6 +351,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
<< std::string(1, ResultChar);
break;
default:
+ Invalid = true;
if (!Diags)
break;
@@ -367,6 +385,21 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
HadError = true;
}
+ if (!HadError && EvalMethod != StringLiteralEvalMethod::Unevaluated &&
+ Transcode && Converter) {
+ // Invalid escapes are written as '?' and then translated.
+ assert(ResultChar <= std::numeric_limits<char>::max());
+ char ByteChar = Invalid ? '?' : ResultChar;
+ auto ErrorOrChar = convertCharacter(StringRef(&ByteChar, 1), *Converter);
+ if (ErrorOrChar)
+ ResultChar = *ErrorOrChar;
+ else {
+ Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+ diag::err_exec_charset_conversion_failed)
+ << ErrorOrChar.getError().message();
+ HadError = true;
+ }
+ }
return ResultChar;
}
@@ -1811,6 +1844,11 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
uint32_t *buffer_begin = &codepoint_buffer.front();
uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
+ const TextEncodingConfig &TEC = PP.getTextEncodingConfig();
+ llvm::TextEncodingConverter *Converter = nullptr;
+ if (isOrdinary())
+ Converter = TEC.getConverter(CA_ToExecEncoding);
+
// Unicode escapes representing characters that cannot be correctly
// represented in a single code unit are disallowed in character literals
// by this implementation.
@@ -1825,7 +1863,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
} else if (tok::utf32_char_constant == Kind) {
largest_character_for_kind = 0x10FFFF;
} else {
- largest_character_for_kind = 0x7Fu;
+ largest_character_for_kind = (Converter == nullptr) ? 0x7Fu : 0xFFu;
}
while (begin != end) {
@@ -1865,6 +1903,22 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = true;
PP.Diag(Loc, diag::err_character_too_large);
}
+ if (!HadError && Converter) {
+ assert(isOrdinary() && "Only ordinary characters are supported");
+ std::string UTF8String;
+ convertUTF32ToUTF8String(
+ ArrayRef<char>(reinterpret_cast<const char *>(tmp_out_start),
+ 4),
+ UTF8String);
+ auto ErrorOrChar = convertCharacter(UTF8String, *Converter);
+ if (ErrorOrChar) {
+ *tmp_out_start = *ErrorOrChar;
+ } else {
+ HadError = true;
+ PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
+ << ErrorOrChar.getError().message();
+ }
+ }
}
}
@@ -1872,16 +1926,37 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
}
// Is this a Universal Character Name escape?
if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
- unsigned short UcnLen = 0;
- if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
- FullSourceLoc(Loc, PP.getSourceManager()),
- &PP.getDiagnostics(), PP.getLangOpts(), true)) {
- HadError = true;
- } else if (*buffer_begin > largest_character_for_kind) {
- HadError = true;
- PP.Diag(Loc, diag::err_character_too_large);
+ if (Converter == nullptr) {
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ &PP.getDiagnostics(), PP.getLangOpts(), true)) {
+ HadError = true;
+ } else if (*buffer_begin > largest_character_for_kind) {
+ HadError = true;
+ PP.Diag(Loc, diag::err_character_too_large);
+ }
+ } else {
+ char Cp[5];
+ char *ResultPtr = Cp;
+ EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ /*CharByteWidth=*/1u, &PP.getDiagnostics(),
+ PP.getLangOpts());
+ assert(ResultPtr - Cp <= 4 &&
+ "unexpected result size for UCN escape character");
+ if (!HadError) {
+ auto ErrorOrChar =
+ convertCharacter(StringRef(Cp, ResultPtr - Cp), *Converter);
+ if (ErrorOrChar)
+ *buffer_begin = *ErrorOrChar;
+ else {
+ PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
+ << ErrorOrChar.getError().message();
+ HadError = true;
+ }
+ }
}
-
++buffer_begin;
continue;
}
@@ -1890,7 +1965,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
ProcessCharEscape(TokBegin, begin, end, HadError,
FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
&PP.getDiagnostics(), PP.getLangOpts(),
- StringLiteralEvalMethod::Evaluated);
+ StringLiteralEvalMethod::Evaluated, nullptr);
*buffer_begin++ = result;
}
@@ -2000,16 +2075,18 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
///
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
Preprocessor &PP,
- StringLiteralEvalMethod EvalMethod)
+ StringLiteralEvalMethod EvalMethod,
+ ConversionAction Action)
: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
- MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
- ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
- Pascal(false) {
- init(StringToks);
+ TEC(&PP.getTextEncodingConfig()), MaxTokenLength(0), SizeBound(0),
+ CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
+ EvalMethod(EvalMethod), hadError(false), Pascal(false) {
+ init(StringToks, Action);
}
-void StringLiteralParser::init(ArrayRef<Token> StringToks){
+void StringLiteralParser::init(ArrayRef<Token> StringToks,
+ ConversionAction Action) {
// The literal token may have come from an invalid source location (e.g. due
// to a PCH error), in which case the token length will be 0.
if (StringToks.empty() || StringToks[0].getLength() < 2)
@@ -2101,6 +2178,10 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
SourceLocation UDSuffixTokLoc;
+ llvm::TextEncodingConverter *Converter = nullptr;
+ if (isOrdinary() && TEC)
+ Converter = TEC->getConverter(Action);
+
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
const char *ThisTokBuf = &TokenBuf[0];
// Get the spelling of the token, which eliminates trigraphs, etc. We know
@@ -2211,7 +2292,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
// Copy everything before the \r\n sequence into the string literal.
- if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
+ if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF,
+ Converter))
hadError = true;
// Point into the \n inside the \r\n sequence and operate on the
@@ -2250,24 +2332,32 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
// Copy the character span over.
if (CopyStringFragment(StringToks[i], ThisTokBegin,
- StringRef(InStart, ThisTokBuf - InStart)))
+ StringRef(InStart, ThisTokBuf - InStart),
+ Converter))
hadError = true;
continue;
}
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
ThisTokBuf[1] == 'N') {
- EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
- ResultPtr, hadError,
+ char *Cp = ResultPtr;
+ EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr,
+ hadError,
FullSourceLoc(StringToks[i].getLocation(), SM),
CharByteWidth, Diags, Features);
+ if (!hadError && Converter) {
+ SmallString<8> CpConv;
+ Converter->convert(StringRef(Cp), CpConv);
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ ResultPtr = Cp + CpConv.size();
+ }
continue;
}
// Otherwise, this is a non-UCN escape character. Process it.
- unsigned ResultChar =
- ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
- FullSourceLoc(StringToks[i].getLocation(), SM),
- CharByteWidth * 8, Diags, Features, EvalMethod);
+ unsigned ResultChar = ProcessCharEscape(
+ ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
+ FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth * 8,
+ Diags, Features, EvalMethod, Converter);
if (CharByteWidth == 4) {
// FIXME: Make the type of the result buffer correct instead of
@@ -2343,12 +2433,29 @@ static const char *resyncUTF8(const char *Err, const char *End) {
/// This function copies from Fragment, which is a sequence of bytes
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
/// Performs widening for multi-byte characters.
-bool StringLiteralParser::CopyStringFragment(const Token &Tok,
- const char *TokBegin,
- StringRef Fragment) {
+bool StringLiteralParser::CopyStringFragment(
+ const Token &Tok, const char *TokBegin, StringRef Fragment,
+ llvm::TextEncodingConverter *Converter) {
+
const llvm::UTF8 *ErrorPtrTmp;
- if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
+ if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) {
+ if (Converter) {
+ assert(isOrdinary() && "Only ordinary literals are supported");
+ SmallString<64> CpConv;
+ char *Cp = ResultPtr - Fragment.size();
+ auto EC = Converter->convert(Fragment, CpConv);
+ if (!EC) {
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ ResultPtr = Cp + CpConv.size();
+ } else { // there was a conversion error
+ if (Diags)
+ Diags->Report(Tok.getLocation(),
+ diag::err_exec_charset_conversion_failed)
+ << EC.message();
+ }
+ }
return false;
+ }
// If we see bad encoding for unprefixed string literals, warn and
// simply copy the byte values, for compatibility with gcc and older
@@ -2465,7 +2572,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
} else {
ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
- Diags, Features, StringLiteralEvalMethod::Evaluated);
+ Diags, Features, StringLiteralEvalMethod::Evaluated,
+ /*TextEncodingConfig=*/nullptr);
--ByteNo;
}
assert(!HadError && "This method isn't valid on erroneous strings");
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index 6e90f20572f1f..1add87d2a5177 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -1650,7 +1650,8 @@ void Preprocessor::HandleLineDirective() {
return;
} else {
// Parse and validate the string, converting it into a unique ID.
- StringLiteralParser Literal(StrTok, *this);
+ StringLiteralParser Literal(
+ StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion);
assert(Literal.isOrdinary() && "Didn't allow wide strings in");
if (Literal.hadError) {
DiscardUntilEndOfDirective();
@@ -1801,7 +1802,8 @@ void Preprocessor::HandleDigitDirective(Token &DigitTok) {
return;
} else {
// Parse and validate the string, converting it into a unique ID.
- StringLiteralParser Literal(StrTok, *this);
+ StringLiteralParser Literal(
+ StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion);
assert(Literal.isOrdinary() && "Didn't allow wide strings in");
if (Literal.hadError) {
DiscardUntilEndOfDirective();
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp
new file mode 100644
index 0000000000000..b89d5baefcc23
--- /dev/null
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -0,0 +1,45 @@
+//===--- TextEncodingConfig.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+/// Returns the converter to the execution encoding when \p Action is
+/// CA_ToExecEncoding; every other action yields nullptr.
+llvm::TextEncodingConverter *
+TextEncodingConfig::getConverter(ConversionAction Action) const {
+  if (Action == CA_ToExecEncoding)
+    return ToExecEncodingConverter;
+  return nullptr;
+}
+
+/// Initializes \p TEC from the execution encoding requested in \p Opts.
+///
+/// An empty option selects UTF-8. When the execution encoding is UTF-8 no
+/// converter is installed; otherwise a converter from UTF-8 to the requested
+/// encoding is created and stored in \p TEC.
+///
+/// \returns a default-constructed error code on success, or the error from
+/// llvm::TextEncodingConverter::create if the converter cannot be created. In
+/// the failure case the requested name is still recorded in \p TEC so callers
+/// can report it.
+std::error_code
+TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC,
+                                             const clang::LangOptions &Opts) {
+  const char *UTF8 = "UTF-8";
+
+  // Release any converter left over from an earlier call so that it is
+  // neither leaked nor kept when the new encoding needs no conversion.
+  delete TEC.ToExecEncodingConverter;
+  TEC.ToExecEncodingConverter = nullptr;
+
+  // ExecEncoding is a non-owning view: it refers either to the literal above
+  // or to the string stored in Opts.
+  if (Opts.ExecEncoding.empty())
+    TEC.ExecEncoding = UTF8;
+  else
+    TEC.ExecEncoding = Opts.ExecEncoding;
+
+  // The internal encoding is UTF-8, so a UTF-8 execution encoding needs no
+  // converter.
+  if (TEC.ExecEncoding == UTF8)
+    return std::error_code();
+
+  // Create the converter between the internal encoding and the execution
+  // encoding specified by the fexec-charset option.
+  llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
+      llvm::TextEncodingConverter::create(UTF8, TEC.ExecEncoding);
+  if (!ErrorOrConverter)
+    return ErrorOrConverter.getError();
+
+  TEC.ToExecEncodingConverter =
+      new llvm::TextEncodingConverter(std::move(*ErrorOrConverter));
+  return std::error_code();
+}
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
new file mode 100644
index 0000000000000..897b9d2eeefa1
--- /dev/null
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | FileCheck %s --check-prefix=CHECK-UTF8
+
+const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+//CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+//CHECK-UTF8: c"ABCDEFGHIJKLMNOPQRSTUVWXYZ\00"
+
+const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";
+//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00"
+//CHECK-UTF8: c"abcdefghijklmnopqrstuvwxyz\00"
+
+const char *Digits = "0123456789";
+//CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+//CHECK-UTF8: c"0123456789\00"
+
+const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@=";
+//CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+//CHECK-UTF8: c" .<(+|&!$*);^-/,%%_>`:#@=\00"
+
+const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00"
+//CHECK-UTF8: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
+
+const char *InvalidEscape = "\y\z";
+//CHECK: c"oo\00"
+//CHECK-UTF8: c"yz\00"
+
+const char *HexCharacters = "\x12\x13\x14";
+//CHECK: c"\12\13\14\00"
+//CHECK-UTF8: c"\12\13\14\00"
+
+const char *OctalCharacters = "\141\142\143";
+//CHECK: c"abc\00"
+//CHECK-UTF8: c"abc\00"
+
+const char singleChar = 'a';
+//CHECK: i8 -127
+//CHECK-UTF8: 97
+
+#ifndef IBM1047_ONLY
+const char cent = '¢';
+//CHECK: i8 74
+
+const char currency = '¤';
+//CHECK: i8 -97
+#endif
+
+const char *UcnCharacters = "\u00E2\u00AC\U000000DF";
+//CHECK: c"B\B0Y\00"
+//CHECK-UTF8: c"\C3\A2\C2\AC\C3\9F\00"
+
+const char *Unicode = "ÿ";
+//CHECK: c"\DF\00"
+//CHECK-UTF8: c"\C3\BF\00"
+
+// RUN: not %clang_cc1 -fexec-charset invalid %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+// CHECK-ERROR: error: failed to set fexec-charset to 'invalid'
+
diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp
new file mode 100644
index 0000000000000..f7becd5b39492
--- /dev/null
+++ b/clang/test/CodeGen/systemz-charset.cpp
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -std=c++17 -fexec-charset IBM-1047 -o - | FileCheck %s
+
+const char *RawString = R"(Hello\n)";
+//CHECK: c"\C8\85\93\93\96\E0\95\00"
+
+const char *MultiLineRawString = R"(
+Hello
+There)";
+//CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00"
+
+char UnicodeChar8 = u8'1';
+//CHECK: i8 49
+char16_t UnicodeChar16 = u'1';
+//CHECK: i16 49
+char32_t UnicodeChar32 = U'1';
+//CHECK: i32 49
+
+const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
+
+const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0]
+
+const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0]
+
+const char *UnicodeString8 = u8"Hello";
+//CHECK: c"Hello\00"
+const char16_t *UnicodeString16 = u"Hello";
+//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0]
+const char32_t *UnicodeString32 = U"Hello";
+//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0]
+
+const char *UnicodeRawString8 = u8R"("Hello\")";
+//CHECK: c"\22Hello\\\22\00"
+const char16_t *UnicodeRawString16 = uR"("Hello\")";
+//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0]
+const char32_t *UnicodeRawString32 = UR"("Hello\")";
+//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0]
+
+const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF";
+//CHECK: c"\C3\A2\C2\AC\C3\9F\00"
+const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0]
+const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0]
diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c
index a8fbde46cbb75..4414c7d919879 100644
--- a/clang/test/Preprocessor/init-s390x.c
+++ b/clang/test/Preprocessor/init-s390x.c
@@ -206,4 +206,5 @@
// S390X-ZOS: #define __TOS_390__ 1
// S390X-ZOS: #define __TOS_MVS__ 1
// S390X-ZOS: #define __XPLINK__ 1
+// S390X-ZOS: #define __clang_literal_encoding__ "IBM-1047"
// S390X-ZOS-GNUXX: #define __wchar_t 1
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index d5a42d9646c18..74e6ab17d9b3c 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -528,6 +528,10 @@ class Triple {
/// For example, "fooos1.2.3" would return "1.2.3".
LLVM_ABI StringRef getEnvironmentVersionString() const;
+ /// Get the default system encoding of the triple.
+ /// For example, "IBM-1047" for z/OS, "UTF-8" for others
+ LLVM_ABI StringRef getDefaultNarrowTextEncoding() const;
+
/// @}
/// @name Convenience Predicates
/// @{
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index c6515425b7eb5..1f1812c9f4096 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1678,6 +1678,13 @@ StringRef Triple::getOSAndEnvironmentName() const {
return Tmp.split('-').second; // Strip second component
}
+// z/OS defaults to IBM-1047; every other OS defaults to UTF-8.
+StringRef Triple::getDefaultNarrowTextEncoding() const {
+  return getOS() == llvm::Triple::ZOS ? "IBM-1047" : "UTF-8";
+}
+
static VersionTuple parseVersionFromName(StringRef Name) {
VersionTuple Version;
Version.tryParse(Name);
More information about the cfe-commits
mailing list