[clang] [llvm] Enable fexec-charset option (PR #138895)
Abhina Sree via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 5 05:37:36 PST 2026
https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/138895
>From b16517e01b77108e46cbb2941d4edacd8ef9193b Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 7 May 2025 11:26:59 -0400
Subject: [PATCH 01/15] This patch enables the fexec-charset option to control
the execution charset of string literals. The internal charset is always
UTF-8; the default system and execution charsets are IBM-1047 on z/OS and
UTF-8 on all other platforms.
(cherry picked from commit 0295d0da4db8b8fcd54084dc6ae95d8b0bbf45d9)
(cherry picked from commit e379f6cb9d063cb78c6b48b0e0a8d9f241958f89)
---
clang/docs/LanguageExtensions.rst | 3 +-
clang/include/clang/Basic/LangOptions.h | 3 +
clang/include/clang/Basic/TokenKinds.h | 7 ++
clang/include/clang/Lex/LiteralConverter.h | 36 ++++++
clang/include/clang/Lex/LiteralSupport.h | 19 +--
clang/include/clang/Lex/Preprocessor.h | 3 +
clang/include/clang/Options/Options.td | 5 +
clang/lib/Driver/ToolChains/Clang.cpp | 17 ++-
clang/lib/Frontend/CompilerInstance.cpp | 4 +
clang/lib/Frontend/InitPreprocessor.cpp | 12 +-
clang/lib/Lex/CMakeLists.txt | 1 +
clang/lib/Lex/LiteralConverter.cpp | 69 +++++++++++
clang/lib/Lex/LiteralSupport.cpp | 133 +++++++++++++++++----
clang/test/CodeGen/systemz-charset.c | 35 ++++++
clang/test/CodeGen/systemz-charset.cpp | 46 +++++++
clang/test/Driver/cl-options.c | 7 +-
clang/test/Driver/clang_f_opts.c | 12 +-
clang/test/Preprocessor/init-s390x.c | 1 +
llvm/include/llvm/TargetParser/Triple.h | 3 +
llvm/lib/TargetParser/Triple.cpp | 7 ++
20 files changed, 375 insertions(+), 48 deletions(-)
create mode 100644 clang/include/clang/Lex/LiteralConverter.h
create mode 100644 clang/lib/Lex/LiteralConverter.cpp
create mode 100644 clang/test/CodeGen/systemz-charset.c
create mode 100644 clang/test/CodeGen/systemz-charset.cpp
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 29328355c3e6f..16be538069a32 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -422,8 +422,7 @@ Builtin Macros
``__clang_literal_encoding__``
Defined to a narrow string literal that represents the current encoding of
narrow string literals, e.g., ``"hello"``. This macro typically expands to
- "UTF-8" (but may change in the future if the
- ``-fexec-charset="Encoding-Name"`` option is implemented.)
+ the text encoding named by ``-fexec-charset`` if that option is given, or to the system charset otherwise.
``__clang_wide_literal_encoding__``
Defined to a narrow string literal that represents the current encoding of
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index ebd0436fa154b..9c2c79ff33366 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -593,6 +593,9 @@ class LangOptions : public LangOptionsBase {
/// The allocation token mode.
std::optional<llvm::AllocTokenMode> AllocTokenMode;
+ /// Name of the exec charset to convert the internal charset to.
+ std::string ExecCharset;
+
LangOptions();
/// Set language defaults for the given input language and
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index c0316257d9d97..1b2152ae518c7 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -115,6 +115,13 @@ inline bool isLiteral(TokenKind K) {
return isInLiteralRange;
}
+/// Return true if this is a utf literal kind.
+inline bool isUTFLiteral(TokenKind K) {
+ return K == tok::utf8_char_constant || K == tok::utf8_string_literal ||
+ K == tok::utf16_char_constant || K == tok::utf16_string_literal ||
+ K == tok::utf32_char_constant || K == tok::utf32_string_literal;
+}
+
/// Return true if this is any of tok::annot_* kinds.
bool isAnnotation(TokenKind K);
diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h
new file mode 100644
index 0000000000000..999b2c146930f
--- /dev/null
+++ b/clang/include/clang/Lex/LiteralConverter.h
@@ -0,0 +1,36 @@
+//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
+#define LLVM_CLANG_LEX_LITERALCONVERTER_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/TextEncoding.h"
+
+enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset };
+
+class LiteralConverter {
+ llvm::StringRef InternalCharset;
+ llvm::StringRef SystemCharset;
+ llvm::StringRef ExecCharset;
+ llvm::StringMap<llvm::TextEncodingConverter> TextEncodingConverters;
+
+public:
+ llvm::TextEncodingConverter *getConverter(const char *Codepage);
+ llvm::TextEncodingConverter *getConverter(ConversionAction Action);
+ llvm::TextEncodingConverter *createAndInsertCharConverter(const char *To);
+ void setConvertersFromOptions(const clang::LangOptions &Opts,
+ const clang::TargetInfo &TInfo,
+ clang::DiagnosticsEngine &Diags);
+};
+
+#endif
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index ea5f63bc20399..eaa2016c6a888 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -17,12 +17,13 @@
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/LiteralConverter.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
-
+#include "llvm/Support/TextEncoding.h"
namespace clang {
class DiagnosticsEngine;
@@ -233,6 +234,7 @@ class StringLiteralParser {
const LangOptions &Features;
const TargetInfo &Target;
DiagnosticsEngine *Diags;
+ LiteralConverter *LiteralConv;
unsigned MaxTokenLength;
unsigned SizeBound;
@@ -246,18 +248,19 @@ class StringLiteralParser {
StringLiteralEvalMethod EvalMethod;
public:
- StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
- StringLiteralEvalMethod StringMethod =
- StringLiteralEvalMethod::Evaluated);
+ StringLiteralParser(
+ ArrayRef<Token> StringToks, Preprocessor &PP,
+ StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
+ ConversionAction Action = ToExecCharset);
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
: SM(sm), Features(features), Target(target), Diags(diags),
- MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
- ResultPtr(ResultBuf.data()),
+ LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0),
+ Kind(tok::unknown), ResultPtr(ResultBuf.data()),
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
Pascal(false) {
- init(StringToks);
+ init(StringToks, NoConversion);
}
bool hadError;
@@ -305,7 +308,7 @@ class StringLiteralParser {
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);
private:
- void init(ArrayRef<Token> StringToks);
+ void init(ArrayRef<Token> StringToks, ConversionAction Action);
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
StringRef Fragment);
void DiagnoseLexingError(SourceLocation Loc);
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 5fb83eafc6b2a..62a65d4127947 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -25,6 +25,7 @@
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralConverter.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/ModuleLoader.h"
#include "clang/Lex/ModuleMap.h"
@@ -222,6 +223,7 @@ class Preprocessor {
std::unique_ptr<ScratchBuffer> ScratchBuf;
HeaderSearch &HeaderInfo;
ModuleLoader &TheModuleLoader;
+ LiteralConverter LiteralConv;
/// External source of macros.
ExternalPreprocessorSource *ExternalSource;
@@ -1299,6 +1301,7 @@ class Preprocessor {
SelectorTable &getSelectorTable() { return Selectors; }
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
+ LiteralConverter &getLiteralConverter() { return LiteralConv; }
void setExternalSource(ExternalPreprocessorSource *Source) {
ExternalSource = Source;
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 40251a65f8f70..63832667aeb53 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -7670,6 +7670,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in {
def tune_cpu : Separate<["-"], "tune-cpu">,
HelpText<"Tune for a specific cpu type">,
MarshallingInfoString<TargetOpts<"TuneCPU">>;
+def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
+ HelpText<"Set the execution <charset> for string and character literals. "
+ "Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
+ "and those supported by the host icu or iconv library.">,
+ MarshallingInfoString<LangOpts<"ExecCharset">>;
def target_cpu : Separate<["-"], "target-cpu">,
HelpText<"Target a specific cpu type">,
MarshallingInfoString<TargetOpts<"CPU">>;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 87d4d73748940..2eeacc438bc2f 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -50,6 +50,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
+#include "llvm/Support/TextEncoding.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/TargetParser/ARMTargetParserCommon.h"
@@ -7564,12 +7565,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
<< value;
}
- // -fexec_charset=UTF-8 is default. Reject others
+ // Set the default fexec-charset as the system charset.
+ CmdArgs.push_back("-fexec-charset");
+ CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset()));
if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
StringRef value = execCharset->getValue();
- if (!value.equals_insensitive("utf-8"))
- D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
- << value;
+ llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
+ llvm::TextEncodingConverter::create("UTF-8", value.data());
+ if (ErrorOrConverter) {
+ CmdArgs.push_back("-fexec-charset");
+ CmdArgs.push_back(Args.MakeArgString(value));
+ } else {
+ D.Diag(diag::err_drv_invalid_value)
+ << execCharset->getAsString(Args) << value;
+ }
}
RenderDiagnosticsOptions(D, Args, CmdArgs);
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index cb3e6fb9688a9..976e9806cf680 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -32,6 +32,7 @@
#include "clang/Frontend/Utils.h"
#include "clang/Frontend/VerifyDiagnosticConsumer.h"
#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/LiteralConverter.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Sema/CodeCompleteConsumer.h"
@@ -544,6 +545,9 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
+
+ PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(),
+ getDiagnostics());
}
std::string
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 18c694579abdf..07c7551162cb1 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1027,10 +1027,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
}
}
- // Macros to help identify the narrow and wide character sets
- // FIXME: clang currently ignores -fexec-charset=. If this changes,
- // then this may need to be updated.
- Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\"");
+ // Macros to help identify the narrow and wide character sets. The narrow
+ // literal encoding macro is set to the -fexec-charset value; if that option
+ // is not specified, it defaults to the system charset.
+ if (!LangOpts.ExecCharset.empty())
+ Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecCharset);
+ else
+ Builder.defineMacro("__clang_literal_encoding__",
+ TI.getTriple().getSystemCharset());
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
// if -fwide-exec-charset= is ever supported.
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt
index f61737cd68021..9e38a1b8fbb44 100644
--- a/clang/lib/Lex/CMakeLists.txt
+++ b/clang/lib/Lex/CMakeLists.txt
@@ -12,6 +12,7 @@ add_clang_library(clangLex
InitHeaderSearch.cpp
Lexer.cpp
LexHLSLRootSignature.cpp
+ LiteralConverter.cpp
LiteralSupport.cpp
MacroArgs.cpp
MacroInfo.cpp
diff --git a/clang/lib/Lex/LiteralConverter.cpp b/clang/lib/Lex/LiteralConverter.cpp
new file mode 100644
index 0000000000000..b00f44a238ec0
--- /dev/null
+++ b/clang/lib/Lex/LiteralConverter.cpp
@@ -0,0 +1,69 @@
+//===--- LiteralConverter.cpp - Translator for String Literals -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/LiteralConverter.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+llvm::TextEncodingConverter *
+LiteralConverter::getConverter(const char *Codepage) {
+ auto Iter = TextEncodingConverters.find(Codepage);
+ if (Iter != TextEncodingConverters.end())
+ return &Iter->second;
+ return nullptr;
+}
+
+llvm::TextEncodingConverter *
+LiteralConverter::getConverter(ConversionAction Action) {
+ StringRef CodePage;
+ if (Action == ToSystemCharset)
+ CodePage = SystemCharset;
+ else if (Action == ToExecCharset)
+ CodePage = ExecCharset;
+ else
+ CodePage = InternalCharset;
+ return getConverter(CodePage.data());
+}
+
+llvm::TextEncodingConverter *
+LiteralConverter::createAndInsertCharConverter(const char *To) {
+ const char *From = InternalCharset.data();
+ llvm::TextEncodingConverter *Converter = getConverter(To);
+ if (Converter)
+ return Converter;
+
+ ErrorOr<TextEncodingConverter> ErrorOrConverter =
+ llvm::TextEncodingConverter::create(From, To);
+ if (!ErrorOrConverter)
+ return nullptr;
+ TextEncodingConverters.insert_or_assign(StringRef(To),
+ std::move(*ErrorOrConverter));
+ return getConverter(To);
+}
+
+void LiteralConverter::setConvertersFromOptions(
+ const clang::LangOptions &Opts, const clang::TargetInfo &TInfo,
+ clang::DiagnosticsEngine &Diags) {
+ using namespace llvm;
+ SystemCharset = TInfo.getTriple().getSystemCharset();
+ InternalCharset = "UTF-8";
+ ExecCharset = Opts.ExecCharset.empty() ? InternalCharset : Opts.ExecCharset;
+ // Create converter between internal and system charset
+ if (InternalCharset != SystemCharset)
+ createAndInsertCharConverter(SystemCharset.data());
+
+ // Create converter between internal and exec charset specified
+ // in fexec-charset option.
+ if (InternalCharset == ExecCharset)
+ return;
+ if (!createAndInsertCharConverter(ExecCharset.data())) {
+ Diags.Report(clang::diag::err_drv_invalid_value)
+ << "-fexec-charset" << ExecCharset;
+ }
+}
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index c220821a0098f..600a308543292 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -134,7 +134,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
FullSourceLoc Loc, unsigned CharWidth,
DiagnosticsEngine *Diags,
const LangOptions &Features,
- StringLiteralEvalMethod EvalMethod) {
+ StringLiteralEvalMethod EvalMethod,
+ llvm::TextEncodingConverter *Converter) {
const char *EscapeBegin = ThisTokBuf;
bool Delimited = false;
bool EndDelimiterFound = false;
@@ -146,6 +147,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
// that would have been \", which would not have been the end of string.
unsigned ResultChar = *ThisTokBuf++;
char Escape = ResultChar;
+ bool Translate = true;
+ bool Invalid = false;
switch (ResultChar) {
// These map to themselves.
case '\\': case '\'': case '"': case '?': break;
@@ -186,6 +189,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
ResultChar = 11;
break;
case 'x': { // Hex escape.
+ Translate = false;
ResultChar = 0;
if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
Delimited = true;
@@ -249,6 +253,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
case '4': case '5': case '6': case '7': {
// Octal escapes.
--ThisTokBuf;
+ Translate = false;
ResultChar = 0;
// Octal escapes are a series of octal digits with maximum length 3.
@@ -334,6 +339,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
<< std::string(1, ResultChar);
break;
default:
+ Invalid = true;
if (!Diags)
break;
@@ -367,6 +373,15 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
HadError = true;
}
+ if (Translate && Converter) {
+ // Invalid escapes are written as '?' and then translated.
+ char ByteChar = Invalid ? '?' : ResultChar;
+ SmallString<8> ResultCharConv;
+ Converter->convert(StringRef(&ByteChar, 1), ResultCharConv);
+ assert(ResultCharConv.size() == 1 &&
+ "Char size increased after translation");
+ ResultChar = ResultCharConv[0];
+ }
return ResultChar;
}
@@ -1751,6 +1766,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = false;
Kind = kind;
+ LiteralConverter *LiteralConv = &PP.getLiteralConverter();
const char *TokBegin = begin;
@@ -1817,6 +1833,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
largest_character_for_kind = 0x7Fu;
}
+ llvm::TextEncodingConverter *Converter = nullptr;
+ if (!isUTFLiteral(Kind) && LiteralConv)
+ Converter = LiteralConv->getConverter(ToExecCharset);
+
while (begin != end) {
// Is this a span of non-escape characters?
if (begin[0] != '\\') {
@@ -1854,6 +1874,16 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = true;
PP.Diag(Loc, diag::err_character_too_large);
}
+ if (!HadError && Converter) {
+ assert(Kind != tok::wide_char_constant &&
+ "Wide character translation not supported");
+ char ByteChar = *tmp_out_start;
+ SmallString<1> ConvertedChar;
+ Converter->convert(StringRef(&ByteChar, 1), ConvertedChar);
+ assert(ConvertedChar.size() == 1 &&
+ "Char size increased after translation");
+ *tmp_out_start = ConvertedChar[0];
+ }
}
}
@@ -1861,16 +1891,35 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
}
// Is this a Universal Character Name escape?
if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
- unsigned short UcnLen = 0;
- if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
- FullSourceLoc(Loc, PP.getSourceManager()),
- &PP.getDiagnostics(), PP.getLangOpts(), true)) {
- HadError = true;
- } else if (*buffer_begin > largest_character_for_kind) {
- HadError = true;
- PP.Diag(Loc, diag::err_character_too_large);
+ if (Converter == nullptr) {
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ &PP.getDiagnostics(), PP.getLangOpts(), true)) {
+ HadError = true;
+ } else if (*buffer_begin > largest_character_for_kind) {
+ HadError = true;
+ PP.Diag(Loc, diag::err_character_too_large);
+ }
+ } else {
+ char Cp[8];
+ char *ResultPtr = Cp;
+ unsigned CharByteWidth = 1;
+ EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ CharByteWidth, &PP.getDiagnostics(), PP.getLangOpts());
+ if (!HadError) {
+ SmallString<8> CpConv;
+ Converter->convert(StringRef(Cp), CpConv);
+ if (CpConv.size() > 1) {
+ HadError = true;
+ PP.Diag(Loc, diag::err_character_too_large);
+ } else {
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ *buffer_begin = *Cp;
+ }
+ }
}
-
++buffer_begin;
continue;
}
@@ -1879,7 +1928,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
ProcessCharEscape(TokBegin, begin, end, HadError,
FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
&PP.getDiagnostics(), PP.getLangOpts(),
- StringLiteralEvalMethod::Evaluated);
+ StringLiteralEvalMethod::Evaluated, nullptr);
*buffer_begin++ = result;
}
@@ -1989,16 +2038,18 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
///
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
Preprocessor &PP,
- StringLiteralEvalMethod EvalMethod)
+ StringLiteralEvalMethod EvalMethod,
+ ConversionAction Action)
: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
- MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
- ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
- Pascal(false) {
- init(StringToks);
+ LiteralConv(&PP.getLiteralConverter()), MaxTokenLength(0), SizeBound(0),
+ CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
+ EvalMethod(EvalMethod), hadError(false), Pascal(false) {
+ init(StringToks, Action);
}
-void StringLiteralParser::init(ArrayRef<Token> StringToks){
+void StringLiteralParser::init(ArrayRef<Token> StringToks,
+ ConversionAction Action) {
// The literal token may have come from an invalid source location (e.g. due
// to a PCH error), in which case the token length will be 0.
if (StringToks.empty() || StringToks[0].getLength() < 2)
@@ -2090,6 +2141,10 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
SourceLocation UDSuffixTokLoc;
+ llvm::TextEncodingConverter *Converter = nullptr;
+ if (!isUTFLiteral(Kind) && LiteralConv)
+ Converter = LiteralConv->getConverter(Action);
+
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
const char *ThisTokBuf = &TokenBuf[0];
// Get the spelling of the token, which eliminates trigraphs, etc. We know
@@ -2203,6 +2258,16 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
hadError = true;
+ if (!hadError && Converter) {
+ assert(Kind != tok::wide_string_literal &&
+ "Wide character translation not supported");
+ SmallString<256> CpConv;
+ int ResultLength = BeforeCRLF.size() * CharByteWidth;
+ char *Cp = ResultPtr - ResultLength;
+ Converter->convert(StringRef(Cp, ResultLength), CpConv);
+ memcpy(Cp, CpConv.data(), ResultLength);
+ ResultPtr = Cp + CpConv.size();
+ }
// Point into the \n inside the \r\n sequence and operate on the
// remaining portion of the literal.
RemainingTokenSpan = AfterCRLF.substr(1);
@@ -2237,26 +2302,45 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks){
++ThisTokBuf;
} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
+ int Length = ThisTokBuf - InStart;
// Copy the character span over.
if (CopyStringFragment(StringToks[i], ThisTokBegin,
StringRef(InStart, ThisTokBuf - InStart)))
hadError = true;
+
+ if (!hadError && Converter) {
+ assert(Kind != tok::wide_string_literal &&
+ "Wide character translation not supported");
+ SmallString<256> CpConv;
+ int ResultLength = Length * CharByteWidth;
+ char *Cp = ResultPtr - ResultLength;
+ Converter->convert(StringRef(Cp, ResultLength), CpConv);
+ memcpy(Cp, CpConv.data(), ResultLength);
+ ResultPtr = Cp + CpConv.size();
+ }
continue;
}
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
ThisTokBuf[1] == 'N') {
- EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
- ResultPtr, hadError,
+ char *Cp = ResultPtr;
+ EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr,
+ hadError,
FullSourceLoc(StringToks[i].getLocation(), SM),
CharByteWidth, Diags, Features);
+ if (!hadError && Converter) {
+ SmallString<8> CpConv;
+ Converter->convert(StringRef(Cp), CpConv);
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ ResultPtr = Cp + CpConv.size();
+ }
continue;
}
// Otherwise, this is a non-UCN escape character. Process it.
- unsigned ResultChar =
- ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
- FullSourceLoc(StringToks[i].getLocation(), SM),
- CharByteWidth * 8, Diags, Features, EvalMethod);
+ unsigned ResultChar = ProcessCharEscape(
+ ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
+ FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth * 8,
+ Diags, Features, EvalMethod, Converter);
if (CharByteWidth == 4) {
// FIXME: Make the type of the result buffer correct instead of
@@ -2454,7 +2538,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
} else {
ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
- Diags, Features, StringLiteralEvalMethod::Evaluated);
+ Diags, Features, StringLiteralEvalMethod::Evaluated,
+ nullptr);
--ByteNo;
}
assert(!HadError && "This method isn't valid on erroneous strings");
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
new file mode 100644
index 0000000000000..aab43157b1be4
--- /dev/null
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
+// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+
+const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+// CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+
+const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";
+//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00"
+
+const char *Digits = "0123456789";
+// CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+
+const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@=";
+// CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+
+const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00"
+
+const char *InvalidEscape = "\y\z";
+//CHECK: c"oo\00"
+
+const char *HexCharacters = "\x12\x13\x14";
+//CHECK: c"\12\13\14\00"
+
+const char *OctalCharacters = "\141\142\143";
+//CHECK: c"abc\00"
+
+const char singleChar = 'a';
+//CHECK: i8 -127
+
+const char *UcnCharacters = "\u00E2\u00AC\U000000DF";
+//CHECK: c"B\B0Y\00"
+
+const char *Unicode = "ÿ";
+//CHECK: c"\DF\00"
diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp
new file mode 100644
index 0000000000000..7e66407fd2ff1
--- /dev/null
+++ b/clang/test/CodeGen/systemz-charset.cpp
@@ -0,0 +1,46 @@
+// RUN: %clang %s -std=c++17 -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+
+const char *RawString = R"(Hello\n)";
+//CHECK: c"\C8\85\93\93\96\E0\95\00"
+
+const char *MultiLineRawString = R"(
+Hello
+There)";
+//CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00"
+
+char UnicodeChar8 = u8'1';
+//CHECK: i8 49
+char16_t UnicodeChar16 = u'1';
+//CHECK: i16 49
+char32_t UnicodeChar32 = U'1';
+//CHECK: i32 49
+
+const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
+
+const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0]
+
+const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?";
+//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0]
+
+const char *UnicodeString8 = u8"Hello";
+//CHECK: c"Hello\00"
+const char16_t *UnicodeString16 = u"Hello";
+//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0]
+const char32_t *UnicodeString32 = U"Hello";
+//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0]
+
+const char *UnicodeRawString8 = u8R"("Hello\")";
+//CHECK: c"\22Hello\\\22\00"
+const char16_t *UnicodeRawString16 = uR"("Hello\")";
+//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0]
+const char32_t *UnicodeRawString32 = UR"("Hello\")";
+//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0]
+
+const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF";
+//CHECK: c"\C3\A2\C2\AC\C3\9F\00"
+const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0]
+const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF";
+//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0]
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 9fd3ff6326dcc..b0caab09405b3 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -250,10 +250,11 @@
// RUN: not %clang_cl /source-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=source-charset-utf-16 %s
// source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16'
-// /execution-charset: should warn on everything except UTF-8.
-// RUN: not %clang_cl /execution-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-utf-16 %s
-// execution-charset-utf-16: invalid value 'utf-16' in '/execution-charset:utf-16'
+// /execution-charset: should report an error on invalid charsets.
+// RUN: %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s
+// execution-charset-invalid: invalid value 'invalid-charset' in '/execution-charset:invalid-charset'
//
+
// RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
// RUN: %clang_cl /U mymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
// U: "-U" "mymacro"
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index 5871f1580d6b7..d791329b6c438 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -232,8 +232,14 @@
// RUN: not %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1'
-// RUN: not %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-EXEC-CHARSET %s
-// CHECK-INVALID-EXEC-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1'
+// RUN: %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
+// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset=invalid-charset'
+
+// Test that we support the following exec charsets.
+// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// RUN: %clang -### -S -fexec-charset=ISO8859-1 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
+// INVALID-NOT: error: invalid value
// Test that we don't error on these.
// RUN: not %clang -### -S -Werror \
@@ -247,7 +253,7 @@
// RUN: -fident -fno-ident \
// RUN: -fimplicit-templates -fno-implicit-templates \
// RUN: -finput-charset=UTF-8 \
-// RUN: -fexec-charset=UTF-8 \
+// RUN: -fexec-charset=UTF-8 \
// RUN: -fivopts -fno-ivopts \
// RUN: -fnon-call-exceptions -fno-non-call-exceptions \
// RUN: -fpermissive -fno-permissive \
diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c
index a8fbde46cbb75..9ff122def913f 100644
--- a/clang/test/Preprocessor/init-s390x.c
+++ b/clang/test/Preprocessor/init-s390x.c
@@ -206,4 +206,5 @@
// S390X-ZOS: #define __TOS_390__ 1
// S390X-ZOS: #define __TOS_MVS__ 1
// S390X-ZOS: #define __XPLINK__ 1
+// S390X-ZOS: #define __clang_literal_encoding__ IBM-1047
// S390X-ZOS-GNUXX: #define __wchar_t 1
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index a940236c6aca4..e55f67feaf2d2 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -513,6 +513,9 @@ class Triple {
/// For example, "fooos1.2.3" would return "1.2.3".
LLVM_ABI StringRef getEnvironmentVersionString() const;
+ /// getSystemCharset - Get the system charset of the triple.
+ StringRef getSystemCharset() const;
+
/// @}
/// @name Convenience Predicates
/// @{
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 0a775fda7e542..1d7650a40f76e 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1446,6 +1446,13 @@ StringRef Triple::getOSAndEnvironmentName() const {
return Tmp.split('-').second; // Strip second component
}
+// System charset on z/OS is IBM-1047 and UTF-8 otherwise
+StringRef Triple::getSystemCharset() const {
+ if (getOS() == llvm::Triple::ZOS)
+ return "IBM-1047";
+ return "UTF-8";
+}
+
static VersionTuple parseVersionFromName(StringRef Name) {
VersionTuple Version;
Version.tryParse(Name);
>From 13ce473e88e176c4db03bca283d20799f2f91570 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 11 Jun 2025 11:22:40 -0400
Subject: [PATCH 02/15] replace StringMap with pointer
---
clang/docs/LanguageExtensions.rst | 5 +-
clang/include/clang/Basic/LangOptions.h | 4 +-
clang/include/clang/Lex/LiteralConverter.h | 13 ++---
clang/include/clang/Lex/LiteralSupport.h | 2 +-
clang/include/clang/Options/Options.td | 2 +-
clang/lib/Driver/ToolChains/Clang.cpp | 8 +--
clang/lib/Frontend/InitPreprocessor.cpp | 6 +-
clang/lib/Lex/LiteralConverter.cpp | 66 ++++++++--------------
clang/lib/Lex/LiteralSupport.cpp | 2 +-
llvm/include/llvm/TargetParser/Triple.h | 4 +-
llvm/lib/TargetParser/Triple.cpp | 4 +-
11 files changed, 50 insertions(+), 66 deletions(-)
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 16be538069a32..74f78cd57bdbf 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -421,8 +421,9 @@ Builtin Macros
``__clang_literal_encoding__``
Defined to a narrow string literal that represents the current encoding of
- narrow string literals, e.g., ``"hello"``. This macro typically expands to
- the text encoding specified by -fexec-charset if specified, or the system charset.
+ narrow string literals, e.g., ``"hello"``. This macro expands to the text
+ encoding specified by ``-fexec-charset`` if any, or a system-specific default
+ otherwise: ``"IBM-1047"`` on z/OS and ``"UTF-8"`` on all other systems.
``__clang_wide_literal_encoding__``
Defined to a narrow string literal that represents the current encoding of
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 9c2c79ff33366..3a1ea63972d9c 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -593,8 +593,8 @@ class LangOptions : public LangOptionsBase {
/// The allocation token mode.
std::optional<llvm::AllocTokenMode> AllocTokenMode;
- /// Name of the exec charset to convert the internal charset to.
- std::string ExecCharset;
+ /// Name of the execution encoding to convert the internal encoding to.
+ std::string ExecEncoding;
LangOptions();
diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h
index 999b2c146930f..ee489bf6ce510 100644
--- a/clang/include/clang/Lex/LiteralConverter.h
+++ b/clang/include/clang/Lex/LiteralConverter.h
@@ -16,18 +16,17 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/TextEncoding.h"
-enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset };
+enum ConversionAction { NoConversion, ToSystemEncoding, ToExecEncoding };
class LiteralConverter {
- llvm::StringRef InternalCharset;
- llvm::StringRef SystemCharset;
- llvm::StringRef ExecCharset;
- llvm::StringMap<llvm::TextEncodingConverter> TextEncodingConverters;
+ llvm::StringRef InternalEncoding;
+ llvm::StringRef SystemEncoding;
+ llvm::StringRef ExecEncoding;
+ llvm::TextEncodingConverter *ToSystemEncodingConverter;
+ llvm::TextEncodingConverter *ToExecEncodingConverter;
public:
- llvm::TextEncodingConverter *getConverter(const char *Codepage);
llvm::TextEncodingConverter *getConverter(ConversionAction Action);
- llvm::TextEncodingConverter *createAndInsertCharConverter(const char *To);
void setConvertersFromOptions(const clang::LangOptions &Opts,
const clang::TargetInfo &TInfo,
clang::DiagnosticsEngine &Diags);
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index eaa2016c6a888..af0296912b8df 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -251,7 +251,7 @@ class StringLiteralParser {
StringLiteralParser(
ArrayRef<Token> StringToks, Preprocessor &PP,
StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
- ConversionAction Action = ToExecCharset);
+ ConversionAction Action = ToExecEncoding);
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 63832667aeb53..183c4f824897c 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -7674,7 +7674,7 @@ def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
HelpText<"Set the execution <charset> for string and character literals. "
"Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
"and those supported by the host icu or iconv library.">,
- MarshallingInfoString<LangOpts<"ExecCharset">>;
+ MarshallingInfoString<LangOpts<"ExecEncoding">>;
def target_cpu : Separate<["-"], "target-cpu">,
HelpText<"Target a specific cpu type">,
MarshallingInfoString<TargetOpts<"CPU">>;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 2eeacc438bc2f..24d75ffc93aa2 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7567,9 +7567,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
// Set the default fexec-charset as the system charset.
CmdArgs.push_back("-fexec-charset");
- CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset()));
- if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
- StringRef value = execCharset->getValue();
+ CmdArgs.push_back(Args.MakeArgString(Triple.getDefaultTextEncoding()));
+ if (Arg *execEncoding = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
+ StringRef value = execEncoding->getValue();
llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
llvm::TextEncodingConverter::create("UTF-8", value.data());
if (ErrorOrConverter) {
@@ -7577,7 +7577,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back(Args.MakeArgString(value));
} else {
D.Diag(diag::err_drv_invalid_value)
- << execCharset->getAsString(Args) << value;
+ << execEncoding->getAsString(Args) << value;
}
}
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 07c7551162cb1..dda530f286173 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1030,11 +1030,11 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
// Macros to help identify the narrow and wide character sets. This is set
// to fexec-charset. If fexec-charset is not specified, the default is the
// system charset.
- if (!LangOpts.ExecCharset.empty())
- Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecCharset);
+ if (!LangOpts.ExecEncoding.empty())
+ Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecEncoding);
else
Builder.defineMacro("__clang_literal_encoding__",
- TI.getTriple().getSystemCharset());
+ TI.getTriple().getDefaultTextEncoding());
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
// if -fwide-exec-charset= is ever supported.
diff --git a/clang/lib/Lex/LiteralConverter.cpp b/clang/lib/Lex/LiteralConverter.cpp
index b00f44a238ec0..e9f8981336e8e 100644
--- a/clang/lib/Lex/LiteralConverter.cpp
+++ b/clang/lib/Lex/LiteralConverter.cpp
@@ -11,59 +11,43 @@
using namespace llvm;
-llvm::TextEncodingConverter *
-LiteralConverter::getConverter(const char *Codepage) {
- auto Iter = TextEncodingConverters.find(Codepage);
- if (Iter != TextEncodingConverters.end())
- return &Iter->second;
- return nullptr;
-}
-
llvm::TextEncodingConverter *
LiteralConverter::getConverter(ConversionAction Action) {
- StringRef CodePage;
- if (Action == ToSystemCharset)
- CodePage = SystemCharset;
- else if (Action == ToExecCharset)
- CodePage = ExecCharset;
+ if (Action == ToSystemEncoding)
+ return ToSystemEncodingConverter;
+ else if (Action == ToExecEncoding)
+ return ToExecEncodingConverter;
else
- CodePage = InternalCharset;
- return getConverter(CodePage.data());
-}
-
-llvm::TextEncodingConverter *
-LiteralConverter::createAndInsertCharConverter(const char *To) {
- const char *From = InternalCharset.data();
- llvm::TextEncodingConverter *Converter = getConverter(To);
- if (Converter)
- return Converter;
-
- ErrorOr<TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create(From, To);
- if (!ErrorOrConverter)
return nullptr;
- TextEncodingConverters.insert_or_assign(StringRef(To),
- std::move(*ErrorOrConverter));
- return getConverter(To);
}
void LiteralConverter::setConvertersFromOptions(
const clang::LangOptions &Opts, const clang::TargetInfo &TInfo,
clang::DiagnosticsEngine &Diags) {
using namespace llvm;
- SystemCharset = TInfo.getTriple().getSystemCharset();
- InternalCharset = "UTF-8";
- ExecCharset = Opts.ExecCharset.empty() ? InternalCharset : Opts.ExecCharset;
- // Create converter between internal and system charset
- if (InternalCharset != SystemCharset)
- createAndInsertCharConverter(SystemCharset.data());
+ InternalEncoding = "UTF-8";
+ SystemEncoding = TInfo.getTriple().getDefaultTextEncoding();
+ ExecEncoding =
+ Opts.ExecEncoding.empty() ? InternalEncoding : Opts.ExecEncoding;
+ // Create converter between internal and system encoding
+ if (InternalEncoding != SystemEncoding) {
+ ErrorOr<TextEncodingConverter> ErrorOrConverter =
+ llvm::TextEncodingConverter::create(InternalEncoding, SystemEncoding);
+ if (!ErrorOrConverter)
+ return;
+ ToSystemEncodingConverter =
+ new TextEncodingConverter(std::move(*ErrorOrConverter));
+ }
- // Create converter between internal and exec charset specified
+ // Create converter between internal and exec encoding specified
// in fexec-charset option.
- if (InternalCharset == ExecCharset)
+ if (InternalEncoding == ExecEncoding)
return;
- if (!createAndInsertCharConverter(ExecCharset.data())) {
+ ErrorOr<TextEncodingConverter> ErrorOrConverter =
+ llvm::TextEncodingConverter::create(InternalEncoding, ExecEncoding);
+ if (!ErrorOrConverter)
Diags.Report(clang::diag::err_drv_invalid_value)
- << "-fexec-charset" << ExecCharset;
- }
+ << "-fexec-charset" << ExecEncoding;
+ ToExecEncodingConverter =
+ new TextEncodingConverter(std::move(*ErrorOrConverter));
}
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 600a308543292..55d9aced73ad0 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1835,7 +1835,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
llvm::TextEncodingConverter *Converter = nullptr;
if (!isUTFLiteral(Kind) && LiteralConv)
- Converter = LiteralConv->getConverter(ToExecCharset);
+ Converter = LiteralConv->getConverter(ToExecEncoding);
while (begin != end) {
// Is this a span of non-escape characters?
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index e55f67feaf2d2..b88200eb0cf27 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -513,8 +513,8 @@ class Triple {
/// For example, "fooos1.2.3" would return "1.2.3".
LLVM_ABI StringRef getEnvironmentVersionString() const;
- /// getSystemCharset - Get the system charset of the triple.
- StringRef getSystemCharset() const;
+ /// getDefaultTextEncoding - Get the default encoding of the triple.
+ StringRef getDefaultTextEncoding() const;
/// @}
/// @name Convenience Predicates
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 1d7650a40f76e..b10421dc9dfae 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1446,8 +1446,8 @@ StringRef Triple::getOSAndEnvironmentName() const {
return Tmp.split('-').second; // Strip second component
}
-// System charset on z/OS is IBM-1047 and UTF-8 otherwise
-StringRef Triple::getSystemCharset() const {
+// Default encoding on z/OS is IBM-1047 and UTF-8 otherwise
+StringRef Triple::getDefaultTextEncoding() const {
if (getOS() == llvm::Triple::ZOS)
return "IBM-1047";
return "UTF-8";
>From 94d0ff2f2e304e4a4701c89b354e3cd4a98938bc Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 21 Oct 2025 09:12:02 -0400
Subject: [PATCH 03/15] do not translate line/digit directives, do not
translate filename
---
clang/lib/Frontend/FrontendAction.cpp | 4 +++-
clang/lib/Lex/PPDirectives.cpp | 6 ++++--
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 79e862f01be14..ee33e30afdfb0 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -525,7 +525,9 @@ static SourceLocation ReadOriginalFileName(CompilerInstance &CI,
if (T.isAtStartOfLine() || T.getKind() != tok::string_literal)
return SourceLocation();
- StringLiteralParser Literal(T, CI.getPreprocessor());
+ StringLiteralParser Literal(T, CI.getPreprocessor(),
+ StringLiteralEvalMethod::Evaluated,
+ CA_NoConversion);
if (Literal.hadError)
return SourceLocation();
RawLexer->LexFromRawLexer(T);
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index 85edbabf09ed3..866cc7e728064 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -1646,7 +1646,8 @@ void Preprocessor::HandleLineDirective() {
return;
} else {
// Parse and validate the string, converting it into a unique ID.
- StringLiteralParser Literal(StrTok, *this);
+ StringLiteralParser Literal(
+ StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion);
assert(Literal.isOrdinary() && "Didn't allow wide strings in");
if (Literal.hadError) {
DiscardUntilEndOfDirective();
@@ -1797,7 +1798,8 @@ void Preprocessor::HandleDigitDirective(Token &DigitTok) {
return;
} else {
// Parse and validate the string, converting it into a unique ID.
- StringLiteralParser Literal(StrTok, *this);
+ StringLiteralParser Literal(
+ StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion);
assert(Literal.isOrdinary() && "Didn't allow wide strings in");
if (Literal.hadError) {
DiscardUntilEndOfDirective();
>From 2e5d6a00e85c487f6cdaaa5a4d2b93188fbdd92b Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 17 Oct 2025 12:14:09 -0400
Subject: [PATCH 04/15] address comments
---
.../clang/Basic/DiagnosticFrontendKinds.td | 3 +-
.../include/clang/Basic/DiagnosticLexKinds.td | 2 +
clang/include/clang/Basic/TokenKinds.h | 7 ++-
clang/include/clang/Lex/LiteralConverter.h | 17 +++---
clang/include/clang/Lex/LiteralSupport.h | 4 +-
clang/include/clang/Options/Options.td | 13 +++--
clang/lib/Driver/ToolChains/Clang.cpp | 26 +++++----
clang/lib/Frontend/CompilerInstance.cpp | 5 +-
clang/lib/Frontend/InitPreprocessor.cpp | 2 +-
clang/lib/Lex/LiteralConverter.cpp | 53 +++++++++++--------
clang/lib/Lex/LiteralSupport.cpp | 44 ++++++++++-----
clang/test/Driver/cl-options.c | 2 +-
clang/test/Driver/clang_f_opts.c | 18 ++++---
llvm/include/llvm/Support/TextEncoding.h | 3 ++
llvm/include/llvm/TargetParser/Triple.h | 4 +-
llvm/lib/Support/TextEncoding.cpp | 3 +-
llvm/lib/TargetParser/Triple.cpp | 2 +-
17 files changed, 131 insertions(+), 77 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 5c62bb70ebd0f..78677396eeab1 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -338,7 +338,8 @@ def err_non_default_visibility_dllimport : Error<
"non-default visibility cannot be applied to 'dllimport' declaration">;
def err_ifunc_resolver_return : Error<
"ifunc resolver function must return a pointer">;
-
+def err_fe_literal_conv_config : Error<
+ "failed to configure the literal converter">;
def warn_atomic_op_misaligned : Warning<
"misaligned atomic operation may incur "
"significant performance penalty"
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 77feea9f869e9..2d239a7880ac8 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -287,6 +287,8 @@ def ext_string_too_long : Extension<"string literal of length %0 exceeds "
"support">, InGroup<OverlengthStrings>;
def err_character_too_large : Error<
"character too large for enclosing character literal type">;
+def err_exec_charset_conversion_failed : Error<
+ "conversion to execution encoding failed: '%0'">;
def warn_c99_compat_unicode_literal : Warning<
"unicode literals are incompatible with C99">,
InGroup<C99Compat>, DefaultIgnore;
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index 1b2152ae518c7..a18b81c4dcc26 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -115,13 +115,18 @@ inline bool isLiteral(TokenKind K) {
return isInLiteralRange;
}
-/// Return true if this is a utf literal kind.
+/// Return true if this is a UTF literal kind.
inline bool isUTFLiteral(TokenKind K) {
return K == tok::utf8_char_constant || K == tok::utf8_string_literal ||
K == tok::utf16_char_constant || K == tok::utf16_string_literal ||
K == tok::utf32_char_constant || K == tok::utf32_string_literal;
}
+/// Return true if this is a wide literal kind.
+inline bool isWideLiteral(TokenKind K) {
+ return K == tok::wide_char_constant || K == tok::wide_string_literal;
+}
+
/// Return true if this is any of tok::annot_* kinds.
bool isAnnotation(TokenKind K);
diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h
index ee489bf6ce510..6a66d2d0ff707 100644
--- a/clang/include/clang/Lex/LiteralConverter.h
+++ b/clang/include/clang/Lex/LiteralConverter.h
@@ -16,20 +16,25 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/TextEncoding.h"
-enum ConversionAction { NoConversion, ToSystemEncoding, ToExecEncoding };
+enum ConversionAction {
+ CA_NoConversion,
+ CA_ToSystemEncoding,
+ CA_ToExecEncoding
+};
class LiteralConverter {
llvm::StringRef InternalEncoding;
llvm::StringRef SystemEncoding;
llvm::StringRef ExecEncoding;
- llvm::TextEncodingConverter *ToSystemEncodingConverter;
- llvm::TextEncodingConverter *ToExecEncodingConverter;
+ llvm::TextEncodingConverter *ToSystemEncodingConverter = nullptr;
+ llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;
public:
llvm::TextEncodingConverter *getConverter(ConversionAction Action);
- void setConvertersFromOptions(const clang::LangOptions &Opts,
- const clang::TargetInfo &TInfo,
- clang::DiagnosticsEngine &Diags);
+ static std::error_code
+ setConvertersFromOptions(LiteralConverter &LiteralConv,
+ const clang::LangOptions &Opts,
+ const clang::TargetInfo &TInfo);
};
#endif
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index af0296912b8df..32ae829096592 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -251,7 +251,7 @@ class StringLiteralParser {
StringLiteralParser(
ArrayRef<Token> StringToks, Preprocessor &PP,
StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
- ConversionAction Action = ToExecEncoding);
+ ConversionAction Action = CA_ToExecEncoding);
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
@@ -260,7 +260,7 @@ class StringLiteralParser {
Kind(tok::unknown), ResultPtr(ResultBuf.data()),
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
Pascal(false) {
- init(StringToks, NoConversion);
+ init(StringToks, CA_NoConversion);
}
bool hadError;
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 183c4f824897c..a343eaa32117a 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -3075,7 +3075,10 @@ def fexperimental_strict_floating_point : Flag<["-"], "fexperimental-strict-floa
def finput_charset_EQ : Joined<["-"], "finput-charset=">,
Visibility<[ClangOption, FlangOption, FC1Option]>, Group<f_Group>,
HelpText<"Specify the default character set for source files">;
-def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>;
+def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>,
+ HelpText<"Set the execution <charset> for string and character literals. "
+ "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
+ "and possibly those supported by ICU or the host iconv library.">;
def finstrument_functions
: Flag<["-"], "finstrument-functions">,
Group<f_Group>,
@@ -7672,8 +7675,8 @@ def tune_cpu : Separate<["-"], "tune-cpu">,
MarshallingInfoString<TargetOpts<"TuneCPU">>;
def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
HelpText<"Set the execution <charset> for string and character literals. "
- "Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
- "and those supported by the host icu or iconv library.">,
+ "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
+ "and possibly those supported by ICU or the host iconv library.">,
MarshallingInfoString<LangOpts<"ExecEncoding">>;
def target_cpu : Separate<["-"], "target-cpu">,
HelpText<"Target a specific cpu type">,
@@ -9315,7 +9318,9 @@ def _SLASH_source_charset : CLCompileJoined<"source-charset:">,
HelpText<"Set source encoding, supports only UTF-8">,
Alias<finput_charset_EQ>;
def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">,
- HelpText<"Set runtime encoding, supports only UTF-8">,
+ HelpText<"Set the execution <charset> for string and character literals. "
+ "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
+ "and possibly those supported by ICU or the host iconv library.">,
Alias<fexec_charset_EQ>;
def _SLASH_std : CLCompileJoined<"std:">,
HelpText<"Set language version (c++14,c++17,c++20,c++23preview,c++latest,c11,c17)">;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 24d75ffc93aa2..631fe204cdd22 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7565,20 +7565,24 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
<< value;
}
- // Set the default fexec-charset as the system charset.
- CmdArgs.push_back("-fexec-charset");
- CmdArgs.push_back(Args.MakeArgString(Triple.getDefaultTextEncoding()));
if (Arg *execEncoding = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
StringRef value = execEncoding->getValue();
- llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create("UTF-8", value.data());
- if (ErrorOrConverter) {
- CmdArgs.push_back("-fexec-charset");
- CmdArgs.push_back(Args.MakeArgString(value));
- } else {
- D.Diag(diag::err_drv_invalid_value)
- << execEncoding->getAsString(Args) << value;
+ bool KnownEncoding =
+ llvm::TextEncodingConverter::getKnownEncoding(value).has_value();
+ if (!KnownEncoding) {
+ llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
+ llvm::TextEncodingConverter::create("UTF-8", value.data());
+ if (!ErrorOrConverter)
+ D.Diag(diag::err_drv_invalid_value)
+ << execEncoding->getAsString(Args) << value;
}
+ CmdArgs.push_back("-fexec-charset");
+ CmdArgs.push_back(Args.MakeArgString(value));
+ } else {
+ // Set the default fexec-charset as the system charset.
+ CmdArgs.push_back("-fexec-charset");
+ CmdArgs.push_back(
+ Args.MakeArgString(Triple.getDefaultNarrowTextEncoding()));
}
RenderDiagnosticsOptions(D, Args, CmdArgs);
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 976e9806cf680..c151d18e7e05e 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -546,8 +546,9 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
- PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(),
- getDiagnostics());
+ if (LiteralConverter::setConvertersFromOptions(PP->getLiteralConverter(),
+ getLangOpts(), getTarget()))
+ PP->getDiagnostics().Report(clang::diag::err_fe_literal_conv_config);
}
std::string
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index dda530f286173..ee43c2552dc4a 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1034,7 +1034,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecEncoding);
else
Builder.defineMacro("__clang_literal_encoding__",
- TI.getTriple().getDefaultTextEncoding());
+ TI.getTriple().getDefaultNarrowTextEncoding());
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
// if -fwide-exec-charset= is ever supported.
diff --git a/clang/lib/Lex/LiteralConverter.cpp b/clang/lib/Lex/LiteralConverter.cpp
index e9f8981336e8e..2bd177d499b87 100644
--- a/clang/lib/Lex/LiteralConverter.cpp
+++ b/clang/lib/Lex/LiteralConverter.cpp
@@ -13,41 +13,48 @@ using namespace llvm;
llvm::TextEncodingConverter *
LiteralConverter::getConverter(ConversionAction Action) {
- if (Action == ToSystemEncoding)
+ if (Action == CA_ToSystemEncoding)
return ToSystemEncodingConverter;
- else if (Action == ToExecEncoding)
+ else if (Action == CA_ToExecEncoding)
return ToExecEncodingConverter;
else
return nullptr;
}
-void LiteralConverter::setConvertersFromOptions(
- const clang::LangOptions &Opts, const clang::TargetInfo &TInfo,
- clang::DiagnosticsEngine &Diags) {
+std::error_code
+LiteralConverter::setConvertersFromOptions(LiteralConverter &LiteralConv,
+ const clang::LangOptions &Opts,
+ const clang::TargetInfo &TInfo) {
using namespace llvm;
- InternalEncoding = "UTF-8";
- SystemEncoding = TInfo.getTriple().getDefaultTextEncoding();
- ExecEncoding =
- Opts.ExecEncoding.empty() ? InternalEncoding : Opts.ExecEncoding;
+ LiteralConv.InternalEncoding = "UTF-8";
+ LiteralConv.SystemEncoding = TInfo.getTriple().getDefaultNarrowTextEncoding();
+ LiteralConv.ExecEncoding = Opts.ExecEncoding.empty()
+ ? LiteralConv.InternalEncoding
+ : Opts.ExecEncoding;
+
// Create converter between internal and system encoding
- if (InternalEncoding != SystemEncoding) {
+ if (LiteralConv.InternalEncoding != LiteralConv.SystemEncoding) {
ErrorOr<TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create(InternalEncoding, SystemEncoding);
- if (!ErrorOrConverter)
- return;
- ToSystemEncodingConverter =
- new TextEncodingConverter(std::move(*ErrorOrConverter));
+ llvm::TextEncodingConverter::create(LiteralConv.InternalEncoding,
+ LiteralConv.SystemEncoding);
+ if (ErrorOrConverter) {
+ LiteralConv.ToSystemEncodingConverter =
+ new TextEncodingConverter(std::move(*ErrorOrConverter));
+ } else
+ return ErrorOrConverter.getError();
}
// Create converter between internal and exec encoding specified
// in fexec-charset option.
- if (InternalEncoding == ExecEncoding)
- return;
+ if (LiteralConv.InternalEncoding == LiteralConv.ExecEncoding)
+ return std::error_code();
ErrorOr<TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create(InternalEncoding, ExecEncoding);
- if (!ErrorOrConverter)
- Diags.Report(clang::diag::err_drv_invalid_value)
- << "-fexec-charset" << ExecEncoding;
- ToExecEncodingConverter =
- new TextEncodingConverter(std::move(*ErrorOrConverter));
+ llvm::TextEncodingConverter::create(LiteralConv.InternalEncoding,
+ LiteralConv.ExecEncoding);
+ if (ErrorOrConverter) {
+ LiteralConv.ToExecEncodingConverter =
+ new TextEncodingConverter(std::move(*ErrorOrConverter));
+ } else
+ return ErrorOrConverter.getError();
+ return std::error_code();
}
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 55d9aced73ad0..b60369da069b3 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -147,7 +147,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
// that would have been \", which would not have been the end of string.
unsigned ResultChar = *ThisTokBuf++;
char Escape = ResultChar;
- bool Translate = true;
+ bool Transcode = true;
bool Invalid = false;
switch (ResultChar) {
// These map to themselves.
@@ -189,7 +189,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
ResultChar = 11;
break;
case 'x': { // Hex escape.
- Translate = false;
+ Transcode = false;
ResultChar = 0;
if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
Delimited = true;
@@ -253,7 +253,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
case '4': case '5': case '6': case '7': {
// Octal escapes.
--ThisTokBuf;
- Translate = false;
+ Transcode = false;
ResultChar = 0;
// Octal escapes are a series of octal digits with maximum length 3.
@@ -373,7 +373,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
HadError = true;
}
- if (Translate && Converter) {
+ if (Transcode && Converter) {
// Invalid escapes are written as '?' and then translated.
char ByteChar = Invalid ? '?' : ResultChar;
SmallString<8> ResultCharConv;
@@ -1834,8 +1834,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
}
llvm::TextEncodingConverter *Converter = nullptr;
- if (!isUTFLiteral(Kind) && LiteralConv)
- Converter = LiteralConv->getConverter(ToExecEncoding);
+ if (!isUTFLiteral(Kind) && !isWideLiteral(Kind) && LiteralConv)
+ Converter = LiteralConv->getConverter(CA_ToExecEncoding);
while (begin != end) {
// Is this a span of non-escape characters?
@@ -2142,7 +2142,7 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
SourceLocation UDSuffixTokLoc;
llvm::TextEncodingConverter *Converter = nullptr;
- if (!isUTFLiteral(Kind) && LiteralConv)
+ if (!isUTFLiteral(Kind) && !isWideLiteral(Kind) && LiteralConv)
Converter = LiteralConv->getConverter(Action);
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
@@ -2264,9 +2264,18 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
SmallString<256> CpConv;
int ResultLength = BeforeCRLF.size() * CharByteWidth;
char *Cp = ResultPtr - ResultLength;
- Converter->convert(StringRef(Cp, ResultLength), CpConv);
- memcpy(Cp, CpConv.data(), ResultLength);
- ResultPtr = Cp + CpConv.size();
+ std::error_code EC =
+ Converter->convert(StringRef(Cp, ResultLength), CpConv);
+ if (EC) {
+ if (Diags)
+ Diags->Report(StringToks[i].getLocation(),
+ diag::err_exec_charset_conversion_failed)
+ << EC.message();
+ hadError = true;
+ } else {
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ ResultPtr = Cp + CpConv.size();
+ }
}
// Point into the \n inside the \r\n sequence and operate on the
// remaining portion of the literal.
@@ -2314,9 +2323,18 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
SmallString<256> CpConv;
int ResultLength = Length * CharByteWidth;
char *Cp = ResultPtr - ResultLength;
- Converter->convert(StringRef(Cp, ResultLength), CpConv);
- memcpy(Cp, CpConv.data(), ResultLength);
- ResultPtr = Cp + CpConv.size();
+ std::error_code EC =
+ Converter->convert(StringRef(Cp, ResultLength), CpConv);
+ if (EC) {
+ if (Diags)
+ Diags->Report(StringToks[i].getLocation(),
+ diag::err_exec_charset_conversion_failed)
+ << EC.message();
+ hadError = true;
+ } else {
+ memcpy(Cp, CpConv.data(), ResultLength);
+ ResultPtr = Cp + CpConv.size();
+ }
}
continue;
}
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index b0caab09405b3..caa370934065d 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -251,7 +251,7 @@
// source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16'
// /execution-charset: should warn on invalid charsets.
-// RUN: %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s
+// RUN: not %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s
// execution-charset-invalid: invalid value 'invalid-charset' in '/execution-charset:invalid-charset'
//
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index d791329b6c438..c9d5785783033 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -232,14 +232,16 @@
// RUN: not %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1'
-// RUN: %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s
-// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset=invalid-charset'
-
-// Test that we support the following exec charsets.
-// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
-// RUN: %clang -### -S -fexec-charset=ISO8859-1 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
-// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=INVALID %s
-// INVALID-NOT: error: invalid value
+// RUN: not %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-EXEC-CHARSET %s
+// CHECK-INVALID-EXEC-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset=invalid-charset'
+
+// Test that we support the following exec charsets. The preferred MIME name is
+// `IBM1047`, but `IBM-1047` is the name used by z/OS USS utilities such as
+// `chtag`.
+// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-EXEC-CHARSET-UTF-8 %s
+// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-EXEC-CHARSET-IBM-1047 %s
+// CHECK-EXEC-CHARSET-UTF-8: "-fexec-charset" "UTF-8"
+// CHECK-EXEC-CHARSET-IBM-1047: "-fexec-charset" "IBM-1047"
// Test that we don't error on these.
// RUN: not %clang -### -S -Werror \
diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h
index 8a304910aa5dd..bda6f2a088eb2 100644
--- a/llvm/include/llvm/Support/TextEncoding.h
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -135,6 +135,9 @@ class TextEncodingConverter {
return std::string(Result);
return EC;
}
+
+ // Maps the encoding name to enum constant if possible.
+ static std::optional<TextEncoding> getKnownEncoding(StringRef Name);
};
} // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index b88200eb0cf27..19586b7df9934 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -513,8 +513,8 @@ class Triple {
/// For example, "fooos1.2.3" would return "1.2.3".
LLVM_ABI StringRef getEnvironmentVersionString() const;
- /// getDefaultTextEncoding - Get the default encoding of the triple.
- StringRef getDefaultTextEncoding() const;
+ /// getDefaultNarrowTextEncoding - Get the default encoding of the triple.
+ StringRef getDefaultNarrowTextEncoding() const;
/// @}
/// @name Convenience Predicates
diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp
index 453af6f7287bc..91bd482ea4a0c 100644
--- a/llvm/lib/Support/TextEncoding.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -44,7 +44,8 @@ static void normalizeCharSetName(StringRef CSName,
}
// Maps the encoding name to enum constant if possible.
-static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
+std::optional<TextEncoding>
+TextEncodingConverter::getKnownEncoding(StringRef Name) {
SmallString<16> Normalized;
normalizeCharSetName(Name, Normalized);
if (Normalized.equals("utf8"))
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index b10421dc9dfae..9175a165e7691 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1447,7 +1447,7 @@ StringRef Triple::getOSAndEnvironmentName() const {
}
// Default encoding on z/OS is IBM-1047 and UTF-8 otherwise
-StringRef Triple::getDefaultTextEncoding() const {
+StringRef Triple::getDefaultNarrowTextEncoding() const {
if (getOS() == llvm::Triple::ZOS)
return "IBM-1047";
return "UTF-8";
>From 6930ac2086347aae08ee51d09e2078957f70abbf Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 10 Nov 2025 15:12:53 -0500
Subject: [PATCH 05/15] fix CI
---
clang/lib/Frontend/CompilerInstance.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index c151d18e7e05e..85ac1bfd1303e 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -546,8 +546,8 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
- if (!LiteralConverter::setConvertersFromOptions(PP->getLiteralConverter(),
- getLangOpts(), getTarget()))
+ if (LiteralConverter::setConvertersFromOptions(PP->getLiteralConverter(),
+ getLangOpts(), getTarget()))
PP->getDiagnostics().Report(clang::diag::err_fe_literal_conv_config);
}
>From 45422864b078f3f4fd49ed4619e7e382dc299da4 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 12 Nov 2025 11:51:52 -0500
Subject: [PATCH 06/15] add more error handling, address comments
---
clang/lib/Lex/LiteralSupport.cpp | 110 +++++++++++++++++++------------
1 file changed, 68 insertions(+), 42 deletions(-)
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index b60369da069b3..f652a41300a43 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1766,7 +1766,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = false;
Kind = kind;
- LiteralConverter *LiteralConv = &PP.getLiteralConverter();
+ LiteralConverter LiteralConv = PP.getLiteralConverter();
const char *TokBegin = begin;
@@ -1834,8 +1834,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
}
llvm::TextEncodingConverter *Converter = nullptr;
- if (!isUTFLiteral(Kind) && !isWideLiteral(Kind) && LiteralConv)
- Converter = LiteralConv->getConverter(CA_ToExecEncoding);
+ if (!isUTFLiteral(Kind) && !isWideLiteral(Kind))
+ Converter = LiteralConv.getConverter(CA_ToExecEncoding);
while (begin != end) {
// Is this a span of non-escape characters?
@@ -1902,21 +1902,29 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
PP.Diag(Loc, diag::err_character_too_large);
}
} else {
- char Cp[8];
+ char Cp[5];
char *ResultPtr = Cp;
- unsigned CharByteWidth = 1;
EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError,
FullSourceLoc(Loc, PP.getSourceManager()),
- CharByteWidth, &PP.getDiagnostics(), PP.getLangOpts());
+ /*CharByteWidth=*/1u, &PP.getDiagnostics(),
+ PP.getLangOpts());
+ assert(ResultPtr - Cp <= 4 &&
+ "unexpected result size for UCN escape character");
if (!HadError) {
SmallString<8> CpConv;
- Converter->convert(StringRef(Cp), CpConv);
- if (CpConv.size() > 1) {
+ StringRef ToConvert(Cp, ResultPtr - Cp);
+ std::error_code EC = Converter->convert(StringRef(Cp), CpConv);
+ if (EC) {
+ PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
+ << EC.message();
HadError = true;
- PP.Diag(Loc, diag::err_character_too_large);
} else {
- memcpy(Cp, CpConv.data(), CpConv.size());
- *buffer_begin = *Cp;
+ if (CpConv.size() > 1) {
+ HadError = true;
+ PP.Diag(Loc, diag::err_character_too_large);
+ } else {
+ *buffer_begin = CpConv[0];
+ }
}
}
}
@@ -2048,6 +2056,42 @@ StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
init(StringToks, Action);
}
+/// Transcodes, in place, the characters that were just appended to the literal
+/// buffer, i.e. the bytes in [ResultPtrBefore, ResultPtr), using \p Converter.
+///
+/// \param ResultPtr one past the last byte of the span to convert.
+/// \param ResultPtrBefore first byte of the span to convert.
+/// \param CharByteWidth width in bytes of one code unit of the literal.
+/// \param hadError must be false on entry; set to true when conversion fails.
+/// \param Converter the converter applied to the span.
+/// \returns the new end of the converted data, or \p ResultPtr unchanged when
+/// the conversion failed (in which case \p hadError is set).
+static char *convertCharactersInPlace(char *ResultPtr, char *ResultPtrBefore,
+                                      const unsigned CharByteWidth,
+                                      bool &hadError,
+                                      llvm::TextEncodingConverter &Converter) {
+  assert(!hadError && "Unexpected call to convertCharactersInPlace");
+
+  SmallString<256> CpConv;
+  const int ResultLength = ResultPtr - ResultPtrBefore;
+  assert(ResultLength % CharByteWidth == 0 &&
+         "Unexpected span of bytes for the characters.");
+  char *Cp = ResultPtrBefore;
+
+  // Narrow literals: the span already is a byte string, so convert it as-is.
+  if (CharByteWidth == 1) {
+    if (Converter.convert(StringRef(Cp, ResultLength), CpConv)) {
+      hadError = true;
+      return ResultPtr;
+    }
+    // NOTE(review): this writes CpConv.size() bytes starting at Cp; it assumes
+    // the converted text fits in the buffer space available there -- confirm
+    // for encodings whose output can be longer than the UTF-8 input.
+    memcpy(Cp, CpConv.data(), CpConv.size());
+    return Cp + CpConv.size();
+  }
+
+  // Wider code units: only the UTF-8 form of the span is handed to the
+  // converter. Running the byte-wise conversion on the raw UTF-16/UTF-32 bytes
+  // first would interpret them as narrow text and could fail spuriously.
+  const int NumChars = ResultLength / static_cast<int>(CharByteWidth);
+  if (NumChars == 0)
+    return ResultPtr; // Nothing to convert; also avoids dividing by zero below.
+
+  std::string UTF8String;
+  bool ToUTF8OK = false;
+  if (CharByteWidth == 4)
+    ToUTF8OK =
+        convertUTF32ToUTF8String(ArrayRef<char>(Cp, ResultLength), UTF8String);
+  else if (CharByteWidth == 2)
+    ToUTF8OK =
+        convertUTF16ToUTF8String(ArrayRef<char>(Cp, ResultLength), UTF8String);
+  if (!ToUTF8OK || Converter.convert(UTF8String, CpConv)) {
+    hadError = true;
+    return ResultPtr;
+  }
+
+  const int NewCharByteWidth = static_cast<int>(CpConv.size()) / NumChars;
+  if (NewCharByteWidth == 0) {
+    // Fewer converted bytes than characters: a zero stride would never advance
+    // the copy loop below.
+    hadError = true;
+    return ResultPtr;
+  }
+  // Scatter each converted character into its code unit, at the
+  // least-significant end for the host byte order.
+  unsigned EndianOffset = llvm::sys::IsBigEndianHost ? CharByteWidth - 1 : 0;
+  for (int i = 0; i < static_cast<int>(CpConv.size()); i += NewCharByteWidth)
+    memcpy(Cp + EndianOffset + i * CharByteWidth, CpConv.data() + i,
+           NewCharByteWidth);
+  return Cp + CpConv.size() * CharByteWidth;
+}
+
void StringLiteralParser::init(ArrayRef<Token> StringToks,
ConversionAction Action) {
// The literal token may have come from an invalid source location (e.g. due
@@ -2254,6 +2298,7 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
+ char *ResultPtrBefore = ResultPtr;
// Copy everything before the \r\n sequence into the string literal.
if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
hadError = true;
@@ -2261,21 +2306,11 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
if (!hadError && Converter) {
assert(Kind != tok::wide_string_literal &&
"Wide character translation not supported");
- SmallString<256> CpConv;
- int ResultLength = BeforeCRLF.size() * CharByteWidth;
- char *Cp = ResultPtr - ResultLength;
- std::error_code EC =
- Converter->convert(StringRef(Cp, ResultLength), CpConv);
- if (EC) {
- if (Diags)
- Diags->Report(StringToks[i].getLocation(),
- diag::err_exec_charset_conversion_failed)
- << EC.message();
- hadError = true;
- } else {
- memcpy(Cp, CpConv.data(), ResultLength);
- ResultPtr = Cp + CpConv.size();
- }
+ ResultPtr = convertCharactersInPlace(
+ ResultPtr, ResultPtrBefore, CharByteWidth, hadError, *Converter);
+ if (hadError && Diags)
+ Diags->Report(StringToks[i].getLocation(),
+ diag::err_exec_charset_conversion_failed);
}
// Point into the \n inside the \r\n sequence and operate on the
// remaining portion of the literal.
@@ -2311,7 +2346,7 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
++ThisTokBuf;
} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
- int Length = ThisTokBuf - InStart;
+ char *ResultPtrBefore = ResultPtr;
// Copy the character span over.
if (CopyStringFragment(StringToks[i], ThisTokBegin,
StringRef(InStart, ThisTokBuf - InStart)))
@@ -2320,21 +2355,12 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
if (!hadError && Converter) {
assert(Kind != tok::wide_string_literal &&
"Wide character translation not supported");
- SmallString<256> CpConv;
- int ResultLength = Length * CharByteWidth;
- char *Cp = ResultPtr - ResultLength;
- std::error_code EC =
- Converter->convert(StringRef(Cp, ResultLength), CpConv);
- if (EC) {
- if (Diags)
- Diags->Report(StringToks[i].getLocation(),
- diag::err_exec_charset_conversion_failed)
- << EC.message();
- hadError = true;
- } else {
- memcpy(Cp, CpConv.data(), ResultLength);
- ResultPtr = Cp + CpConv.size();
- }
+ ResultPtr =
+ convertCharactersInPlace(ResultPtr, ResultPtrBefore,
+ CharByteWidth, hadError, *Converter);
+ if (hadError && Diags)
+ Diags->Report(StringToks[i].getLocation(),
+ diag::err_exec_charset_conversion_failed);
}
continue;
}
>From 123048c74b572233e2e9c8f8d47b11a83c3c8566 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 15 Dec 2025 11:39:03 -0500
Subject: [PATCH 07/15] add proper diagnostics for char conversion
---
.../include/clang/Basic/DiagnosticLexKinds.td | 2 ++
clang/lib/Lex/LiteralSupport.cpp | 34 ++++++++++++++-----
2 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 2d239a7880ac8..1af3e5d20f99a 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -289,6 +289,8 @@ def err_character_too_large : Error<
"character too large for enclosing character literal type">;
def err_exec_charset_conversion_failed : Error<
"conversion to execution encoding failed: '%0'">;
+def err_char_size_increased_after_conversion
+ : Error<"character size of '%0' increased after conversion">;
def warn_c99_compat_unicode_literal : Warning<
"unicode literals are incompatible with C99">,
InGroup<C99Compat>, DefaultIgnore;
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index f652a41300a43..73e119139a80b 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -377,10 +377,20 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
// Invalid escapes are written as '?' and then translated.
char ByteChar = Invalid ? '?' : ResultChar;
SmallString<8> ResultCharConv;
- Converter->convert(StringRef(&ByteChar, 1), ResultCharConv);
- assert(ResultCharConv.size() == 1 &&
- "Char size increased after translation");
- ResultChar = ResultCharConv[0];
+ std::error_code EC =
+ Converter->convert(StringRef(&ByteChar, 1), ResultCharConv);
+ if (EC) {
+ Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+ diag::err_exec_charset_conversion_failed)
+ << EC.message();
+ HadError = true;
+ } else {
+ if (ResultCharConv.size() > 1)
+ Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+ diag::err_char_size_increased_after_conversion)
+ << ByteChar;
+ ResultChar = ResultCharConv[0];
+ }
}
return ResultChar;
}
@@ -1879,10 +1889,18 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
"Wide character translation not supported");
char ByteChar = *tmp_out_start;
SmallString<1> ConvertedChar;
- Converter->convert(StringRef(&ByteChar, 1), ConvertedChar);
- assert(ConvertedChar.size() == 1 &&
- "Char size increased after translation");
- *tmp_out_start = ConvertedChar[0];
+ std::error_code EC =
+ Converter->convert(StringRef(&ByteChar, 1), ConvertedChar);
+ if (EC) {
+ PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
+ << EC.message();
+ HadError = true;
+ } else {
+ if (ConvertedChar.size() > 1)
+ PP.Diag(Loc, diag::err_char_size_increased_after_conversion)
+ << ByteChar;
+ *tmp_out_start = ConvertedChar[0];
+ }
}
}
}
>From b3bc569c8dace6105d2f8e8bb84c978f141a9478 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 16 Dec 2025 09:02:22 -0500
Subject: [PATCH 08/15] rename LiteralConverter to TextEncodingConfig
---
clang/include/clang/Lex/LiteralSupport.h | 10 ++---
clang/include/clang/Lex/Preprocessor.h | 6 +--
...iteralConverter.h => TextEncodingConfig.h} | 6 +--
clang/lib/Frontend/CompilerInstance.cpp | 6 +--
clang/lib/Lex/CMakeLists.txt | 2 +-
clang/lib/Lex/LiteralSupport.cpp | 10 ++---
...alConverter.cpp => TextEncodingConfig.cpp} | 37 +++++++++----------
7 files changed, 38 insertions(+), 39 deletions(-)
rename clang/include/clang/Lex/{LiteralConverter.h => TextEncodingConfig.h} (88%)
rename clang/lib/Lex/{LiteralConverter.cpp => TextEncodingConfig.cpp} (50%)
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index 32ae829096592..b4defac24bf7c 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -17,7 +17,7 @@
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
-#include "clang/Lex/LiteralConverter.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
@@ -234,7 +234,7 @@ class StringLiteralParser {
const LangOptions &Features;
const TargetInfo &Target;
DiagnosticsEngine *Diags;
- LiteralConverter *LiteralConv;
+ TextEncodingConfig *TEC;
unsigned MaxTokenLength;
unsigned SizeBound;
@@ -255,9 +255,9 @@ class StringLiteralParser {
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
- : SM(sm), Features(features), Target(target), Diags(diags),
- LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0),
- Kind(tok::unknown), ResultPtr(ResultBuf.data()),
+ : SM(sm), Features(features), Target(target), Diags(diags), TEC(nullptr),
+ MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+ ResultPtr(ResultBuf.data()),
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
Pascal(false) {
init(StringToks, CA_NoConversion);
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 62a65d4127947..b5c6970e4f64f 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -25,12 +25,12 @@
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/Lexer.h"
-#include "clang/Lex/LiteralConverter.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/ModuleLoader.h"
#include "clang/Lex/ModuleMap.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/PPEmbedParameters.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "clang/Lex/Token.h"
#include "clang/Lex/TokenLexer.h"
#include "clang/Support/Compiler.h"
@@ -223,7 +223,7 @@ class Preprocessor {
std::unique_ptr<ScratchBuffer> ScratchBuf;
HeaderSearch &HeaderInfo;
ModuleLoader &TheModuleLoader;
- LiteralConverter LiteralConv;
+ TextEncodingConfig TEC;
/// External source of macros.
ExternalPreprocessorSource *ExternalSource;
@@ -1301,7 +1301,7 @@ class Preprocessor {
SelectorTable &getSelectorTable() { return Selectors; }
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
- LiteralConverter &getLiteralConverter() { return LiteralConv; }
+ TextEncodingConfig &getTextEncodingConfig() { return TEC; }
void setExternalSource(ExternalPreprocessorSource *Source) {
ExternalSource = Source;
diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/TextEncodingConfig.h
similarity index 88%
rename from clang/include/clang/Lex/LiteralConverter.h
rename to clang/include/clang/Lex/TextEncodingConfig.h
index 6a66d2d0ff707..43933801ca00d 100644
--- a/clang/include/clang/Lex/LiteralConverter.h
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -1,4 +1,4 @@
-//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
+//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -22,7 +22,7 @@ enum ConversionAction {
CA_ToExecEncoding
};
-class LiteralConverter {
+class TextEncodingConfig {
llvm::StringRef InternalEncoding;
llvm::StringRef SystemEncoding;
llvm::StringRef ExecEncoding;
@@ -32,7 +32,7 @@ class LiteralConverter {
public:
llvm::TextEncodingConverter *getConverter(ConversionAction Action);
static std::error_code
- setConvertersFromOptions(LiteralConverter &LiteralConv,
+ setConvertersFromOptions(TextEncodingConfig &TEC,
const clang::LangOptions &Opts,
const clang::TargetInfo &TInfo);
};
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 85ac1bfd1303e..6a35e1c1b234b 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -32,9 +32,9 @@
#include "clang/Frontend/Utils.h"
#include "clang/Frontend/VerifyDiagnosticConsumer.h"
#include "clang/Lex/HeaderSearch.h"
-#include "clang/Lex/LiteralConverter.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "clang/Sema/CodeCompleteConsumer.h"
#include "clang/Sema/ParsedAttr.h"
#include "clang/Sema/Sema.h"
@@ -546,8 +546,8 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
- if (LiteralConverter::setConvertersFromOptions(PP->getLiteralConverter(),
- getLangOpts(), getTarget()))
+ if (TextEncodingConfig::setConvertersFromOptions(PP->getTextEncodingConfig(),
+ getLangOpts(), getTarget()))
PP->getDiagnostics().Report(clang::diag::err_fe_literal_conv_config);
}
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt
index 9e38a1b8fbb44..106a5d3b126be 100644
--- a/clang/lib/Lex/CMakeLists.txt
+++ b/clang/lib/Lex/CMakeLists.txt
@@ -12,7 +12,6 @@ add_clang_library(clangLex
InitHeaderSearch.cpp
Lexer.cpp
LexHLSLRootSignature.cpp
- LiteralConverter.cpp
LiteralSupport.cpp
MacroArgs.cpp
MacroInfo.cpp
@@ -30,6 +29,7 @@ add_clang_library(clangLex
Preprocessor.cpp
PreprocessorLexer.cpp
ScratchBuffer.cpp
+ TextEncodingConfig.cpp
TokenConcatenation.cpp
TokenLexer.cpp
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 73e119139a80b..e57b5a8b48c43 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1776,7 +1776,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = false;
Kind = kind;
- LiteralConverter LiteralConv = PP.getLiteralConverter();
+ TextEncodingConfig TEC = PP.getTextEncodingConfig();
const char *TokBegin = begin;
@@ -1845,7 +1845,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
llvm::TextEncodingConverter *Converter = nullptr;
if (!isUTFLiteral(Kind) && !isWideLiteral(Kind))
- Converter = LiteralConv.getConverter(CA_ToExecEncoding);
+ Converter = TEC.getConverter(CA_ToExecEncoding);
while (begin != end) {
// Is this a span of non-escape characters?
@@ -2068,7 +2068,7 @@ StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
ConversionAction Action)
: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
- LiteralConv(&PP.getLiteralConverter()), MaxTokenLength(0), SizeBound(0),
+ TEC(&PP.getTextEncodingConfig()), MaxTokenLength(0), SizeBound(0),
CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()),
EvalMethod(EvalMethod), hadError(false), Pascal(false) {
init(StringToks, Action);
@@ -2204,8 +2204,8 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
SourceLocation UDSuffixTokLoc;
llvm::TextEncodingConverter *Converter = nullptr;
- if (!isUTFLiteral(Kind) && !isWideLiteral(Kind) && LiteralConv)
- Converter = LiteralConv->getConverter(Action);
+ if (!isUTFLiteral(Kind) && !isWideLiteral(Kind) && TEC)
+ Converter = TEC->getConverter(Action);
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
const char *ThisTokBuf = &TokenBuf[0];
diff --git a/clang/lib/Lex/LiteralConverter.cpp b/clang/lib/Lex/TextEncodingConfig.cpp
similarity index 50%
rename from clang/lib/Lex/LiteralConverter.cpp
rename to clang/lib/Lex/TextEncodingConfig.cpp
index 2bd177d499b87..7fcaabdeb4b88 100644
--- a/clang/lib/Lex/LiteralConverter.cpp
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -1,4 +1,4 @@
-//===--- LiteralConverter.cpp - Translator for String Literals -----------===//
+//===--- TextEncodingConfig.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
//
//===----------------------------------------------------------------------===//
-#include "clang/Lex/LiteralConverter.h"
+#include "clang/Lex/TextEncodingConfig.h"
#include "clang/Basic/DiagnosticDriver.h"
using namespace llvm;
llvm::TextEncodingConverter *
-LiteralConverter::getConverter(ConversionAction Action) {
+TextEncodingConfig::getConverter(ConversionAction Action) {
if (Action == CA_ToSystemEncoding)
return ToSystemEncodingConverter;
else if (Action == CA_ToExecEncoding)
@@ -22,23 +22,22 @@ LiteralConverter::getConverter(ConversionAction Action) {
}
std::error_code
-LiteralConverter::setConvertersFromOptions(LiteralConverter &LiteralConv,
- const clang::LangOptions &Opts,
- const clang::TargetInfo &TInfo) {
+TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC,
+ const clang::LangOptions &Opts,
+ const clang::TargetInfo &TInfo) {
using namespace llvm;
- LiteralConv.InternalEncoding = "UTF-8";
- LiteralConv.SystemEncoding = TInfo.getTriple().getDefaultNarrowTextEncoding();
- LiteralConv.ExecEncoding = Opts.ExecEncoding.empty()
- ? LiteralConv.InternalEncoding
- : Opts.ExecEncoding;
+ TEC.InternalEncoding = "UTF-8";
+ TEC.SystemEncoding = TInfo.getTriple().getDefaultNarrowTextEncoding();
+ TEC.ExecEncoding =
+ Opts.ExecEncoding.empty() ? TEC.InternalEncoding : Opts.ExecEncoding;
// Create converter between internal and system encoding
- if (LiteralConv.InternalEncoding != LiteralConv.SystemEncoding) {
+ if (TEC.InternalEncoding != TEC.SystemEncoding) {
ErrorOr<TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create(LiteralConv.InternalEncoding,
- LiteralConv.SystemEncoding);
+ llvm::TextEncodingConverter::create(TEC.InternalEncoding,
+ TEC.SystemEncoding);
if (ErrorOrConverter) {
- LiteralConv.ToSystemEncodingConverter =
+ TEC.ToSystemEncodingConverter =
new TextEncodingConverter(std::move(*ErrorOrConverter));
} else
return ErrorOrConverter.getError();
@@ -46,13 +45,13 @@ LiteralConverter::setConvertersFromOptions(LiteralConverter &LiteralConv,
// Create converter between internal and exec encoding specified
// in fexec-charset option.
- if (LiteralConv.InternalEncoding == LiteralConv.ExecEncoding)
+ if (TEC.InternalEncoding == TEC.ExecEncoding)
return std::error_code();
ErrorOr<TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create(LiteralConv.InternalEncoding,
- LiteralConv.ExecEncoding);
+ llvm::TextEncodingConverter::create(TEC.InternalEncoding,
+ TEC.ExecEncoding);
if (ErrorOrConverter) {
- LiteralConv.ToExecEncodingConverter =
+ TEC.ToExecEncodingConverter =
new TextEncodingConverter(std::move(*ErrorOrConverter));
} else
return ErrorOrConverter.getError();
>From f3448bfc65c6e70a4b6f0f655158e4132ac33ca0 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 16 Dec 2025 15:09:39 -0500
Subject: [PATCH 09/15] improve testcase
---
.../clang/Basic/DiagnosticFrontendKinds.td | 4 ++--
clang/lib/Frontend/CompilerInstance.cpp | 2 +-
clang/test/CodeGen/systemz-charset.c | 18 +++++++++++++++---
3 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 78677396eeab1..33bc20bfb4cc0 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -338,8 +338,8 @@ def err_non_default_visibility_dllimport : Error<
"non-default visibility cannot be applied to 'dllimport' declaration">;
def err_ifunc_resolver_return : Error<
"ifunc resolver function must return a pointer">;
-def err_fe_literal_conv_config : Error<
- "failed to configure the literal converter">;
+def err_fe_text_encoding_config
+ : Error<"failed to configure the text encoding config">;
def warn_atomic_op_misaligned : Warning<
"misaligned atomic operation may incur "
"significant performance penalty"
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 6a35e1c1b234b..9d523d096f47d 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -548,7 +548,7 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (TextEncodingConfig::setConvertersFromOptions(PP->getTextEncodingConfig(),
getLangOpts(), getTarget()))
- PP->getDiagnostics().Report(clang::diag::err_fe_literal_conv_config);
+ PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config);
}
std::string
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
index aab43157b1be4..c68c58219c2af 100644
--- a/clang/test/CodeGen/systemz-charset.c
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -1,35 +1,47 @@
// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -o - | FileCheck %s --check-prefix=CHECK-UTF8
const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-// CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+//CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
+//CHECK-UTF8: c"ABCDEFGHIJKLMNOPQRSTUVWXYZ\00"
const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz";
//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00"
+//CHECK-UTF8: c"abcdefghijklmnopqrstuvwxyz\00"
const char *Digits = "0123456789";
-// CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+//CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00"
+//CHECK-UTF8: c"0123456789\00"
const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@=";
-// CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+//CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00"
+//CHECK-UTF8: c" .<(+|&!$*);^-/,%%_>`:#@=\00"
const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?";
//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00"
+//CHECK-UTF8: c"\07\08\0C\0A\0D\09\0B\\'\22?\00"
const char *InvalidEscape = "\y\z";
//CHECK: c"oo\00"
+//CHECK-UTF8: c"yz\00"
const char *HexCharacters = "\x12\x13\x14";
//CHECK: c"\12\13\14\00"
+//CHECK-UTF8: c"\12\13\14\00"
const char *OctalCharacters = "\141\142\143";
//CHECK: c"abc\00"
+//CHECK-UTF8: c"abc\00"
const char singleChar = 'a';
//CHECK: i8 -127
+//CHECK-UTF8: 97
const char *UcnCharacters = "\u00E2\u00AC\U000000DF";
//CHECK: c"B\B0Y\00"
+//CHECK-UTF8: c"\C3\A2\C2\AC\C3\9F\00"
const char *Unicode = "ÿ";
//CHECK: c"\DF\00"
+//CHECK-UTF8: c"\C3\BF\00"
>From 96bf61e3e27af1caeca58208855a63a7ac1556db Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Mon, 12 Jan 2026 14:24:09 -0500
Subject: [PATCH 10/15] address comments
---
clang/include/clang/Lex/LiteralSupport.h | 1 +
clang/include/clang/Lex/TextEncodingConfig.h | 6 +++---
clang/lib/Driver/ToolChains/Clang.cpp | 12 ++++++------
clang/lib/Lex/LiteralSupport.cpp | 6 +++---
clang/lib/Lex/TextEncodingConfig.cpp | 10 ++++++----
llvm/include/llvm/TargetParser/Triple.h | 3 ++-
6 files changed, 21 insertions(+), 17 deletions(-)
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index b4defac24bf7c..4af60814668e6 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -24,6 +24,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/TextEncoding.h"
+
namespace clang {
class DiagnosticsEngine;
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h
index 43933801ca00d..492551571ae70 100644
--- a/clang/include/clang/Lex/TextEncodingConfig.h
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
-#define LLVM_CLANG_LEX_LITERALCONVERTER_H
+#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/LangOptions.h"
@@ -30,7 +30,7 @@ class TextEncodingConfig {
llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;
public:
- llvm::TextEncodingConverter *getConverter(ConversionAction Action);
+ llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
static std::error_code
setConvertersFromOptions(TextEncodingConfig &TEC,
const clang::LangOptions &Opts,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 631fe204cdd22..e0b863d0598cc 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7565,19 +7565,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
<< value;
}
- if (Arg *execEncoding = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
- StringRef value = execEncoding->getValue();
+ if (Arg *ExecEncoding = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
+ StringRef Value = ExecEncoding->getValue();
bool KnownEncoding =
- llvm::TextEncodingConverter::getKnownEncoding(value).has_value();
+ llvm::TextEncodingConverter::getKnownEncoding(Value).has_value();
if (!KnownEncoding) {
llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create("UTF-8", value.data());
+ llvm::TextEncodingConverter::create("UTF-8", Value.data());
if (!ErrorOrConverter)
D.Diag(diag::err_drv_invalid_value)
- << execEncoding->getAsString(Args) << value;
+ << ExecEncoding->getAsString(Args) << Value;
}
CmdArgs.push_back("-fexec-charset");
- CmdArgs.push_back(Args.MakeArgString(value));
+ CmdArgs.push_back(Args.MakeArgString(Value));
} else {
// Set the default fexec-charset as the system charset.
CmdArgs.push_back("-fexec-charset");
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index e57b5a8b48c43..ce5030240e133 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1776,7 +1776,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = false;
Kind = kind;
- TextEncodingConfig TEC = PP.getTextEncodingConfig();
+ const TextEncodingConfig TEC = PP.getTextEncodingConfig();
const char *TokBegin = begin;
@@ -1844,7 +1844,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
}
llvm::TextEncodingConverter *Converter = nullptr;
- if (!isUTFLiteral(Kind) && !isWideLiteral(Kind))
+ if (isOrdinary())
Converter = TEC.getConverter(CA_ToExecEncoding);
while (begin != end) {
@@ -2204,7 +2204,7 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
SourceLocation UDSuffixTokLoc;
llvm::TextEncodingConverter *Converter = nullptr;
- if (!isUTFLiteral(Kind) && !isWideLiteral(Kind) && TEC)
+ if (isOrdinary() && TEC)
Converter = TEC->getConverter(Action);
for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp
index 7fcaabdeb4b88..b8e5109241128 100644
--- a/clang/lib/Lex/TextEncodingConfig.cpp
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -12,13 +12,15 @@
using namespace llvm;
llvm::TextEncodingConverter *
-TextEncodingConfig::getConverter(ConversionAction Action) {
- if (Action == CA_ToSystemEncoding)
+TextEncodingConfig::getConverter(ConversionAction Action) const {
+ switch (Action) {
+ case CA_ToSystemEncoding:
return ToSystemEncodingConverter;
- else if (Action == CA_ToExecEncoding)
+ case CA_ToExecEncoding:
return ToExecEncodingConverter;
- else
+ default:
return nullptr;
+ }
}
std::error_code
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 19586b7df9934..192647977db76 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -513,7 +513,8 @@ class Triple {
/// For example, "fooos1.2.3" would return "1.2.3".
LLVM_ABI StringRef getEnvironmentVersionString() const;
- /// getDefaultNarrowTextEncoding - Get the default encoding of the triple.
+ /// Get the default system encoding of the triple.
+ /// For example, "IBM-1047" for z/OS, "UTF-8" for others
StringRef getDefaultNarrowTextEncoding() const;
/// @}
>From 4ca764cf7ebb174151cdc71768af122c0aca930b Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Tue, 13 Jan 2026 09:13:42 -0500
Subject: [PATCH 11/15] remove implementation details from driver, use const
reference
---
clang/lib/Driver/ToolChains/Clang.cpp | 16 ++++++----------
clang/lib/Lex/LiteralSupport.cpp | 2 +-
llvm/include/llvm/Support/TextEncoding.h | 3 +--
llvm/include/llvm/TargetParser/Triple.h | 2 +-
llvm/lib/Support/TextEncoding.cpp | 13 +++++++++++--
5 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index e0b863d0598cc..e2de73313f42e 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7567,17 +7567,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
if (Arg *ExecEncoding = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
StringRef Value = ExecEncoding->getValue();
- bool KnownEncoding =
- llvm::TextEncodingConverter::getKnownEncoding(Value).has_value();
- if (!KnownEncoding) {
- llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create("UTF-8", Value.data());
- if (!ErrorOrConverter)
- D.Diag(diag::err_drv_invalid_value)
- << ExecEncoding->getAsString(Args) << Value;
+ if (llvm::TextEncodingConverter::isEncodingSupported(Value)) {
+ CmdArgs.push_back("-fexec-charset");
+ CmdArgs.push_back(Args.MakeArgString(Value));
+ } else {
+ D.Diag(diag::err_drv_invalid_value)
+ << ExecEncoding->getAsString(Args) << Value;
}
- CmdArgs.push_back("-fexec-charset");
- CmdArgs.push_back(Args.MakeArgString(Value));
} else {
// Set the default fexec-charset as the system charset.
CmdArgs.push_back("-fexec-charset");
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index ce5030240e133..d0f6b79500fe0 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1776,7 +1776,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
HadError = false;
Kind = kind;
- const TextEncodingConfig TEC = PP.getTextEncodingConfig();
+ const TextEncodingConfig &TEC = PP.getTextEncodingConfig();
const char *TokBegin = begin;
diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h
index bda6f2a088eb2..3d31505c5bc6b 100644
--- a/llvm/include/llvm/Support/TextEncoding.h
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -136,8 +136,7 @@ class TextEncodingConverter {
return EC;
}
- // Maps the encoding name to enum constant if possible.
- static std::optional<TextEncoding> getKnownEncoding(StringRef Name);
+ LLVM_ABI static bool isEncodingSupported(StringRef Name);
};
} // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 192647977db76..b23dde01ee4e2 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -515,7 +515,7 @@ class Triple {
/// Get the default system encoding of the triple.
/// For example, "IBM-1047" for z/OS, "UTF-8" for others
- StringRef getDefaultNarrowTextEncoding() const;
+ LLVM_ABI StringRef getDefaultNarrowTextEncoding() const;
/// @}
/// @name Convenience Predicates
diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp
index 91bd482ea4a0c..8b98ae373370d 100644
--- a/llvm/lib/Support/TextEncoding.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -44,8 +44,7 @@ static void normalizeCharSetName(StringRef CSName,
}
// Maps the encoding name to enum constant if possible.
-std::optional<TextEncoding>
-TextEncodingConverter::getKnownEncoding(StringRef Name) {
+static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
SmallString<16> Normalized;
normalizeCharSetName(Name, Normalized);
if (Normalized.equals("utf8"))
@@ -55,6 +54,16 @@ TextEncodingConverter::getKnownEncoding(StringRef Name) {
return std::nullopt;
}
+/// Returns true if \p Name is a known encoding name or if a converter from
+/// UTF-8 to \p Name can be created.
+bool TextEncodingConverter::isEncodingSupported(StringRef Name) {
+ if (getKnownEncoding(Name))
+ return true;
+ // Pass the StringRef itself; Name.data() need not be null-terminated.
+ return static_cast<bool>(
+ llvm::TextEncodingConverter::create("UTF-8", Name));
+}
+
[[maybe_unused]] static void HandleOverflow(size_t &Capacity, char *&Output,
size_t &OutputLength,
SmallVectorImpl<char> &Result) {
>From 84b3a1d8435c06cc3ab6f3a1ff2fb99f616aee09 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Thu, 5 Feb 2026 10:34:49 -0500
Subject: [PATCH 12/15] translate typeinfo to default encoding, address other
comments
---
clang/include/clang/Basic/TokenKinds.h | 7 -------
clang/lib/Lex/TextEncodingConfig.cpp | 8 ++++----
2 files changed, 4 insertions(+), 11 deletions(-)
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index a18b81c4dcc26..ace510278186b 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -115,13 +115,6 @@ inline bool isLiteral(TokenKind K) {
return isInLiteralRange;
}
-/// Return true if this is a UTF literal kind.
-inline bool isUTFLiteral(TokenKind K) {
- return K == tok::utf8_char_constant || K == tok::utf8_string_literal ||
- K == tok::utf16_char_constant || K == tok::utf16_string_literal ||
- K == tok::utf32_char_constant || K == tok::utf32_string_literal;
-}
-
/// Return true if this is a wide literal kind.
inline bool isWideLiteral(TokenKind K) {
return K == tok::wide_char_constant || K == tok::wide_string_literal;
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp
index b8e5109241128..4a1fc477c4ade 100644
--- a/clang/lib/Lex/TextEncodingConfig.cpp
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -38,10 +38,10 @@ TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC,
ErrorOr<TextEncodingConverter> ErrorOrConverter =
llvm::TextEncodingConverter::create(TEC.InternalEncoding,
TEC.SystemEncoding);
- if (ErrorOrConverter) {
+ if (ErrorOrConverter)
TEC.ToSystemEncodingConverter =
new TextEncodingConverter(std::move(*ErrorOrConverter));
- } else
+ else
return ErrorOrConverter.getError();
}
@@ -52,10 +52,10 @@ TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC,
ErrorOr<TextEncodingConverter> ErrorOrConverter =
llvm::TextEncodingConverter::create(TEC.InternalEncoding,
TEC.ExecEncoding);
- if (ErrorOrConverter) {
+ if (ErrorOrConverter)
TEC.ToExecEncodingConverter =
new TextEncodingConverter(std::move(*ErrorOrConverter));
- } else
+ else
return ErrorOrConverter.getError();
return std::error_code();
}
>From 3715ec00258784ea664c7b3b1ffd4a7a83afa3a7 Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 11 Feb 2026 14:01:19 -0500
Subject: [PATCH 13/15] remove ToSystemEncodingConverter, support translating
multibyte characters to single byte in the exec encoding
---
clang/include/clang/Lex/TextEncodingConfig.h | 7 +------
clang/lib/Lex/LiteralSupport.cpp | 19 ++++++++++---------
clang/lib/Lex/TextEncodingConfig.cpp | 14 --------------
clang/test/CodeGen/systemz-charset.c | 10 +++++++++-
4 files changed, 20 insertions(+), 30 deletions(-)
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h
index 492551571ae70..8a10122e9bb37 100644
--- a/clang/include/clang/Lex/TextEncodingConfig.h
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -16,17 +16,12 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/TextEncoding.h"
-enum ConversionAction {
- CA_NoConversion,
- CA_ToSystemEncoding,
- CA_ToExecEncoding
-};
+enum ConversionAction { CA_NoConversion, CA_ToExecEncoding };
class TextEncodingConfig {
llvm::StringRef InternalEncoding;
llvm::StringRef SystemEncoding;
llvm::StringRef ExecEncoding;
- llvm::TextEncodingConverter *ToSystemEncodingConverter = nullptr;
llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;
public:
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index d0f6b79500fe0..e5dd72cf4e143 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1826,6 +1826,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
uint32_t *buffer_begin = &codepoint_buffer.front();
uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
+ llvm::TextEncodingConverter *Converter = nullptr;
+ if (isOrdinary())
+ Converter = TEC.getConverter(CA_ToExecEncoding);
+
// Unicode escapes representing characters that cannot be correctly
// represented in a single code unit are disallowed in character literals
// by this implementation.
@@ -1840,13 +1844,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
} else if (tok::utf32_char_constant == Kind) {
largest_character_for_kind = 0x10FFFF;
} else {
- largest_character_for_kind = 0x7Fu;
+ largest_character_for_kind = (Converter == nullptr) ? 0x7Fu : 0xFFu;
}
- llvm::TextEncodingConverter *Converter = nullptr;
- if (isOrdinary())
- Converter = TEC.getConverter(CA_ToExecEncoding);
-
while (begin != end) {
// Is this a span of non-escape characters?
if (begin[0] != '\\') {
@@ -1887,10 +1887,11 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
if (!HadError && Converter) {
assert(Kind != tok::wide_char_constant &&
"Wide character translation not supported");
- char ByteChar = *tmp_out_start;
+ std::string UTF8String;
+ convertUTF32ToUTF8String(
+ ArrayRef<char>((const char *)tmp_out_start, 4), UTF8String);
SmallString<1> ConvertedChar;
- std::error_code EC =
- Converter->convert(StringRef(&ByteChar, 1), ConvertedChar);
+ std::error_code EC = Converter->convert(UTF8String, ConvertedChar);
if (EC) {
PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
<< EC.message();
@@ -1898,7 +1899,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
} else {
if (ConvertedChar.size() > 1)
PP.Diag(Loc, diag::err_char_size_increased_after_conversion)
- << ByteChar;
+ << UTF8String;
*tmp_out_start = ConvertedChar[0];
}
}
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp
index 4a1fc477c4ade..8dff4063fe481 100644
--- a/clang/lib/Lex/TextEncodingConfig.cpp
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -14,8 +14,6 @@ using namespace llvm;
llvm::TextEncodingConverter *
TextEncodingConfig::getConverter(ConversionAction Action) const {
switch (Action) {
- case CA_ToSystemEncoding:
- return ToSystemEncodingConverter;
case CA_ToExecEncoding:
return ToExecEncodingConverter;
default:
@@ -33,18 +31,6 @@ TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC,
TEC.ExecEncoding =
Opts.ExecEncoding.empty() ? TEC.InternalEncoding : Opts.ExecEncoding;
- // Create converter between internal and system encoding
- if (TEC.InternalEncoding != TEC.SystemEncoding) {
- ErrorOr<TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create(TEC.InternalEncoding,
- TEC.SystemEncoding);
- if (ErrorOrConverter)
- TEC.ToSystemEncodingConverter =
- new TextEncodingConverter(std::move(*ErrorOrConverter));
- else
- return ErrorOrConverter.getError();
- }
-
// Create converter between internal and exec encoding specified
// in fexec-charset option.
if (TEC.InternalEncoding == TEC.ExecEncoding)
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
index c68c58219c2af..33d9ec32d533e 100644
--- a/clang/test/CodeGen/systemz-charset.c
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s
// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s
-// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -o - | FileCheck %s --check-prefix=CHECK-UTF8
+// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | FileCheck %s --check-prefix=CHECK-UTF8
const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
//CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00"
@@ -38,6 +38,14 @@ const char singleChar = 'a';
//CHECK: i8 -127
//CHECK-UTF8: 97
+#ifndef IBM1047_ONLY
+const char cent = '¢';
+//CHECK: i8 74
+
+const char currency = '¤';
+//CHECK: i8 -97
+#endif
+
const char *UcnCharacters = "\u00E2\u00AC\U000000DF";
//CHECK: c"B\B0Y\00"
//CHECK-UTF8: c"\C3\A2\C2\AC\C3\9F\00"
>From 2755287cda7e4a11160c77b151cf682dc356c3ef Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Wed, 11 Feb 2026 14:22:51 -0500
Subject: [PATCH 14/15] add test for cc1 error
---
clang/test/CodeGen/systemz-charset.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
index 33d9ec32d533e..3bfe0890f0b4a 100644
--- a/clang/test/CodeGen/systemz-charset.c
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -53,3 +53,7 @@ const char *UcnCharacters = "\u00E2\u00AC\U000000DF";
const char *Unicode = "ÿ";
//CHECK: c"\DF\00"
//CHECK-UTF8: c"\C3\BF\00"
+
+// RUN: not %clang_cc1 -fexec-charset invalid %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+// CHECK-ERROR: error: failed to configure the text encoding config
+
>From a8be1dec0937e1fceccac8085af68270a19d745a Mon Sep 17 00:00:00 2001
From: Abhina Sreeskantharajan <Abhina.Sreeskantharajan at ibm.com>
Date: Fri, 27 Feb 2026 09:00:13 -0500
Subject: [PATCH 15/15] address comments
---
.../clang/Basic/DiagnosticDriverKinds.td | 2 +
.../clang/Basic/DiagnosticFrontendKinds.td | 3 +-
clang/include/clang/Lex/LiteralSupport.h | 3 +-
clang/include/clang/Lex/TextEncodingConfig.h | 4 +-
clang/lib/Driver/ToolChains/Clang.cpp | 4 +-
clang/lib/Frontend/CompilerInstance.cpp | 7 +-
clang/lib/Lex/LiteralSupport.cpp | 143 +++++++-----------
clang/lib/Lex/TextEncodingConfig.cpp | 11 +-
clang/test/CodeGen/systemz-charset.c | 2 +-
clang/test/Driver/cl-options.c | 2 +-
clang/test/Driver/clang_f_opts.c | 2 +-
11 files changed, 73 insertions(+), 110 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 90a92b1602231..e3736e99a9c57 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -149,6 +149,8 @@ def warn_drv_unsupported_option_part_for_target : Warning<
InGroup<OptionIgnored>;
def err_drv_unsupported_option_part_for_target : Error<
"'%0' in '%1' option is not currently supported for target '%2'">;
+def err_drv_unsupported_encoding_for_target
+ : Error<"'%0' is not a supported encoding in target '%1'">;
def warn_drv_invalid_argument_for_flang : Warning<
"'%0' is not valid for Fortran">,
InGroup<OptionIgnored>;
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 33bc20bfb4cc0..e7c6a52a8284e 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -338,8 +338,7 @@ def err_non_default_visibility_dllimport : Error<
"non-default visibility cannot be applied to 'dllimport' declaration">;
def err_ifunc_resolver_return : Error<
"ifunc resolver function must return a pointer">;
-def err_fe_text_encoding_config
- : Error<"failed to configure the text encoding config">;
+def err_fe_text_encoding_config : Error<"failed to set fexec-charset to '%0'">;
def warn_atomic_op_misaligned : Warning<
"misaligned atomic operation may incur "
"significant performance penalty"
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index 4af60814668e6..6b404403ed95f 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -311,7 +311,8 @@ class StringLiteralParser {
private:
void init(ArrayRef<Token> StringToks, ConversionAction Action);
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
- StringRef Fragment);
+ StringRef Fragment,
+ llvm::TextEncodingConverter *Converter);
void DiagnoseLexingError(SourceLocation Loc);
};
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h
index 8a10122e9bb37..a785edea923de 100644
--- a/clang/include/clang/Lex/TextEncodingConfig.h
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -19,8 +19,6 @@
enum ConversionAction { CA_NoConversion, CA_ToExecEncoding };
class TextEncodingConfig {
- llvm::StringRef InternalEncoding;
- llvm::StringRef SystemEncoding;
llvm::StringRef ExecEncoding;
llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;
@@ -30,6 +28,8 @@ class TextEncodingConfig {
setConvertersFromOptions(TextEncodingConfig &TEC,
const clang::LangOptions &Opts,
const clang::TargetInfo &TInfo);
+
+ /// Returns the execution encoding name stored by setConvertersFromOptions().
+ llvm::StringRef getExecEncoding() const { return ExecEncoding; }
};
#endif
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index e2de73313f42e..8873d3735d0c9 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7571,8 +7571,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fexec-charset");
CmdArgs.push_back(Args.MakeArgString(Value));
} else {
- D.Diag(diag::err_drv_invalid_value)
- << ExecEncoding->getAsString(Args) << Value;
+ D.Diag(diag::err_drv_unsupported_encoding_for_target)
+ << Value << Triple.getTriple();
}
} else {
// Set the default fexec-charset as the system charset.
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 9d523d096f47d..b6cc6c8c60a05 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -546,9 +546,10 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
- if (TextEncodingConfig::setConvertersFromOptions(PP->getTextEncodingConfig(),
- getLangOpts(), getTarget()))
- PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config);
+ if (auto EC = TextEncodingConfig::setConvertersFromOptions(
+ PP->getTextEncodingConfig(), getLangOpts(), getTarget()))
+ PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config)
+ << PP->getTextEncodingConfig().getExecEncoding();
}
std::string
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index e5dd72cf4e143..3838d42d9482b 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -126,6 +126,17 @@ static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
return false;
}
+/// Convert the single character \p Char with \p Converter. Returns the
+/// converter's error if conversion fails, and E2BIG unless the converted
+/// result is exactly one byte.
+static llvm::ErrorOr<char>
+convertCharacter(StringRef Char, llvm::TextEncodingConverter *Converter) {
+ SmallString<8> ResultCharConv;
+ if (std::error_code EC = Converter->convert(Char, ResultCharConv))
+ return EC;
+ // An empty result is rejected too: ResultCharConv[0] would be out of bounds.
+ if (ResultCharConv.size() != 1)
+ return std::error_code(E2BIG, std::generic_category());
+ return ResultCharConv[0];
+}
+
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
/// either a character or a string literal.
static unsigned ProcessCharEscape(const char *ThisTokBegin,
@@ -376,20 +387,14 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
if (Transcode && Converter) {
// Invalid escapes are written as '?' and then translated.
char ByteChar = Invalid ? '?' : ResultChar;
- SmallString<8> ResultCharConv;
- std::error_code EC =
- Converter->convert(StringRef(&ByteChar, 1), ResultCharConv);
- if (EC) {
+ auto ErrorOrChar = convertCharacter(StringRef(&ByteChar, 1), Converter);
+ if (ErrorOrChar)
+ ResultChar = *ErrorOrChar;
+ else {
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
diag::err_exec_charset_conversion_failed)
- << EC.message();
+ << ErrorOrChar.getError().message();
HadError = true;
- } else {
- if (ResultCharConv.size() > 1)
- Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
- diag::err_char_size_increased_after_conversion)
- << ByteChar;
- ResultChar = ResultCharConv[0];
}
}
return ResultChar;
@@ -1885,8 +1890,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
PP.Diag(Loc, diag::err_character_too_large);
}
if (!HadError && Converter) {
- assert(Kind != tok::wide_char_constant &&
- "Wide character translation not supported");
+ assert(isOrdinary() && "Only ordinary characters are supported");
std::string UTF8String;
convertUTF32ToUTF8String(
ArrayRef<char>((const char *)tmp_out_start, 4), UTF8String);
@@ -1897,9 +1901,11 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
<< EC.message();
HadError = true;
} else {
- if (ConvertedChar.size() > 1)
+ if (ConvertedChar.size() > 1) {
+ HadError = true;
PP.Diag(Loc, diag::err_char_size_increased_after_conversion)
<< UTF8String;
+ }
*tmp_out_start = ConvertedChar[0];
}
}
@@ -1930,20 +1936,14 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
assert(ResultPtr - Cp <= 4 &&
"unexpected result size for UCN escape character");
if (!HadError) {
- SmallString<8> CpConv;
- StringRef ToConvert(Cp, ResultPtr - Cp);
- std::error_code EC = Converter->convert(StringRef(Cp), CpConv);
- if (EC) {
+ auto ErrorOrChar =
+ convertCharacter(StringRef(Cp, ResultPtr - Cp), Converter);
+ if (ErrorOrChar)
+ *buffer_begin = *ErrorOrChar;
+ else {
PP.Diag(Loc, diag::err_exec_charset_conversion_failed)
- << EC.message();
+ << ErrorOrChar.getError().message();
HadError = true;
- } else {
- if (CpConv.size() > 1) {
- HadError = true;
- PP.Diag(Loc, diag::err_character_too_large);
- } else {
- *buffer_begin = CpConv[0];
- }
}
}
}
@@ -2075,42 +2075,6 @@ StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
init(StringToks, Action);
}
-static char *convertCharactersInPlace(char *ResultPtr, char *ResultPtrBefore,
- const unsigned CharByteWidth,
- bool &hadError,
- llvm::TextEncodingConverter &Converter) {
- assert(!hadError && "Unexpected call to convertCharactersInPlace");
-
- SmallString<256> CpConv;
- int ResultLength = ResultPtr - ResultPtrBefore;
- assert(ResultLength % CharByteWidth == 0 &&
- "Unexpected span of bytes for the characters.");
- char *Cp = ResultPtrBefore;
- if (Converter.convert(StringRef(Cp, ResultLength / CharByteWidth), CpConv)) {
- hadError = true;
- return ResultPtr;
- }
- if (CharByteWidth == 1) {
- memcpy(Cp, CpConv.data(), CpConv.size());
- return Cp + CpConv.size();
- }
- std::string UTF8String;
- if (CharByteWidth == 4)
- convertUTF32ToUTF8String(ArrayRef<char>(Cp, ResultLength), UTF8String);
- else if (CharByteWidth == 2)
- convertUTF16ToUTF8String(ArrayRef<char>(Cp, ResultLength), UTF8String);
- if (Converter.convert(UTF8String, CpConv)) {
- hadError = true;
- return ResultPtr;
- }
- int NewCharByteWidth = ((int)CpConv.size()) / (ResultLength / CharByteWidth);
- unsigned EndianOffset = llvm::sys::IsBigEndianHost ? CharByteWidth - 1 : 0;
- for (int i = 0; i < (int)CpConv.size(); i += NewCharByteWidth)
- memcpy(Cp + EndianOffset + i * CharByteWidth, CpConv.data() + i,
- NewCharByteWidth);
- return Cp + CpConv.size() * CharByteWidth;
-}
-
void StringLiteralParser::init(ArrayRef<Token> StringToks,
ConversionAction Action) {
// The literal token may have come from an invalid source location (e.g. due
@@ -2317,20 +2281,11 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
- char *ResultPtrBefore = ResultPtr;
// Copy everything before the \r\n sequence into the string literal.
- if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
+ if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF,
+ Converter))
hadError = true;
- if (!hadError && Converter) {
- assert(Kind != tok::wide_string_literal &&
- "Wide character translation not supported");
- ResultPtr = convertCharactersInPlace(
- ResultPtr, ResultPtrBefore, CharByteWidth, hadError, *Converter);
- if (hadError && Diags)
- Diags->Report(StringToks[i].getLocation(),
- diag::err_exec_charset_conversion_failed);
- }
// Point into the \n inside the \r\n sequence and operate on the
// remaining portion of the literal.
RemainingTokenSpan = AfterCRLF.substr(1);
@@ -2365,22 +2320,11 @@ void StringLiteralParser::init(ArrayRef<Token> StringToks,
++ThisTokBuf;
} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
- char *ResultPtrBefore = ResultPtr;
// Copy the character span over.
if (CopyStringFragment(StringToks[i], ThisTokBegin,
- StringRef(InStart, ThisTokBuf - InStart)))
+ StringRef(InStart, ThisTokBuf - InStart),
+ Converter))
hadError = true;
-
- if (!hadError && Converter) {
- assert(Kind != tok::wide_string_literal &&
- "Wide character translation not supported");
- ResultPtr =
- convertCharactersInPlace(ResultPtr, ResultPtrBefore,
- CharByteWidth, hadError, *Converter);
- if (hadError && Diags)
- Diags->Report(StringToks[i].getLocation(),
- diag::err_exec_charset_conversion_failed);
- }
continue;
}
// Is this a Universal Character Name escape?
@@ -2479,12 +2423,29 @@ static const char *resyncUTF8(const char *Err, const char *End) {
/// This function copies from Fragment, which is a sequence of bytes
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
/// Performs widening for multi-byte characters.
-bool StringLiteralParser::CopyStringFragment(const Token &Tok,
- const char *TokBegin,
- StringRef Fragment) {
+bool StringLiteralParser::CopyStringFragment(
+ const Token &Tok, const char *TokBegin, StringRef Fragment,
+ llvm::TextEncodingConverter *Converter) {
+
const llvm::UTF8 *ErrorPtrTmp;
- if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
+ if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) {
+ if (Converter) {
+ assert(isOrdinary() && "Only ordinary literals are supported");
+ SmallString<64> CpConv;
+ char *Cp = ResultPtr - Fragment.size();
+ auto EC = Converter->convert(Fragment, CpConv);
+ if (!EC) {
+ memcpy(Cp, CpConv.data(), CpConv.size());
+ ResultPtr = Cp + CpConv.size();
+ } else { // there was a conversion error
+ if (Diags)
+ Diags->Report(Tok.getLocation(),
+ diag::err_exec_charset_conversion_failed)
+ << EC.message();
+ }
+ }
return false;
+ }
// If we see bad encoding for unprefixed string literals, warn and
// simply copy the byte values, for compatibility with gcc and older
@@ -2602,7 +2563,7 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
Diags, Features, StringLiteralEvalMethod::Evaluated,
- nullptr);
+ /*TextEncodingConfig=*/nullptr);
--ByteNo;
}
assert(!HadError && "This method isn't valid on erroneous strings");
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp
index 8dff4063fe481..f511b22f3906f 100644
--- a/clang/lib/Lex/TextEncodingConfig.cpp
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -26,18 +26,17 @@ TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC,
const clang::LangOptions &Opts,
const clang::TargetInfo &TInfo) {
using namespace llvm;
- TEC.InternalEncoding = "UTF-8";
- TEC.SystemEncoding = TInfo.getTriple().getDefaultNarrowTextEncoding();
+
+ const char *UTF8 = "UTF-8";
TEC.ExecEncoding =
- Opts.ExecEncoding.empty() ? TEC.InternalEncoding : Opts.ExecEncoding;
+ Opts.ExecEncoding.empty() ? UTF8 : Opts.ExecEncoding.c_str();
// Create converter between internal and exec encoding specified
// in fexec-charset option.
- if (TEC.InternalEncoding == TEC.ExecEncoding)
+ if (TEC.ExecEncoding == UTF8)
return std::error_code();
ErrorOr<TextEncodingConverter> ErrorOrConverter =
- llvm::TextEncodingConverter::create(TEC.InternalEncoding,
- TEC.ExecEncoding);
+ llvm::TextEncodingConverter::create(UTF8, TEC.ExecEncoding);
if (ErrorOrConverter)
TEC.ToExecEncodingConverter =
new TextEncodingConverter(std::move(*ErrorOrConverter));
diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c
index 3bfe0890f0b4a..fa5c2ea5ef8d5 100644
--- a/clang/test/CodeGen/systemz-charset.c
+++ b/clang/test/CodeGen/systemz-charset.c
@@ -55,5 +55,5 @@ const char *Unicode = "ÿ";
//CHECK-UTF8: c"\C3\BF\00"
// RUN: not %clang_cc1 -fexec-charset invalid %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
-// CHECK-ERROR: error: failed to configure the text encoding config
+// CHECK-ERROR: error: failed to set fexec-charset to 'invalid'
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index caa370934065d..3d48aa6b7f804 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -252,7 +252,7 @@
// /execution-charset: should warn on invalid charsets.
// RUN: not %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s
-// execution-charset-invalid: invalid value 'invalid-charset' in '/execution-charset:invalid-charset'
+// execution-charset-invalid: 'invalid-charset' is not a supported encoding in target
//
// RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index c9d5785783033..5eadf355de367 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -233,7 +233,7 @@
// CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1'
// RUN: not %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-EXEC-CHARSET %s
-// CHECK-INVALID-EXEC-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset=invalid-charset'
+// CHECK-INVALID-EXEC-CHARSET: error: 'invalid-charset' is not a supported encoding in target
// Test that we support the following exec charsets. The preferred MIME name is
// `IBM1047`, but `IBM-1047` is the name used by z/OS USS utilities such as
More information about the cfe-commits mailing list