[llvm-branch-commits] [clang] [llvm] Continuation of fexec-charset (PR #169803)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Nov 27 05:50:34 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-support
Author: Abhina Sree (abhina-sree)
<details>
<summary>Changes</summary>
This patch builds upon https://github.com/llvm/llvm-project/pull/138895 and introduces a ParserConversionAction, which controls which charset is used for various string literals. I also introduce a FormatStrConverter, which is used for format string checking.
---
Patch is 58.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169803.diff
22 Files Affected:
- (modified) clang/include/clang/AST/Expr.h (+6)
- (modified) clang/include/clang/AST/FormatString.h (+7-6)
- (modified) clang/include/clang/Basic/TargetInfo.h (+3)
- (modified) clang/include/clang/Lex/LiteralConverter.h (+1-1)
- (modified) clang/include/clang/Parse/Parser.h (+1)
- (modified) clang/include/clang/Sema/Sema.h (+6-2)
- (modified) clang/lib/AST/Expr.cpp (+15)
- (modified) clang/lib/AST/FormatString.cpp (+126-116)
- (modified) clang/lib/AST/FormatStringParsing.h (+23-16)
- (modified) clang/lib/AST/PrintfFormatString.cpp (+84-65)
- (modified) clang/lib/AST/ScanfFormatString.cpp (+19-12)
- (modified) clang/lib/Basic/TargetInfo.cpp (+3)
- (modified) clang/lib/Lex/LiteralConverter.cpp (+9-1)
- (modified) clang/lib/Parse/ParseDecl.cpp (+13)
- (modified) clang/lib/Parse/ParseDeclCXX.cpp (+7-3)
- (modified) clang/lib/Parse/ParseExpr.cpp (+5-4)
- (modified) clang/lib/Parse/Parser.cpp (+4)
- (modified) clang/lib/Sema/SemaChecking.cpp (+40-36)
- (modified) clang/lib/Sema/SemaExpr.cpp (+10-7)
- (modified) clang/test/CodeGen/systemz-charset.c (+8)
- (modified) llvm/include/llvm/Support/TextEncoding.h (+10)
- (modified) llvm/lib/Support/TextEncoding.cpp (+19)
``````````diff
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 573cc72db35c6..7d1ac3193812f 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -28,6 +28,7 @@
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SyncScope.h"
#include "clang/Basic/TypeTraits.h"
+#include "clang/Lex/LiteralConverter.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SmallVector.h"
@@ -2063,6 +2064,11 @@ class PredefinedExpr final
return getIdentKindName(getIdentKind());
}
+ static std::string
+ ComputeNameAndTranslate(PredefinedIdentKind IK, const Decl *CurrentDecl,
+ LiteralConverter &LiteralConv,
+ bool ForceElaboratedPrinting = false);
+
static std::string ComputeName(PredefinedIdentKind IK,
const Decl *CurrentDecl,
bool ForceElaboratedPrinting = false);
diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h
index a284f2c44d633..12083a0d00b4b 100644
--- a/clang/include/clang/AST/FormatString.h
+++ b/clang/include/clang/AST/FormatString.h
@@ -19,6 +19,7 @@
#define LLVM_CLANG_AST_FORMATSTRING_H
#include "clang/AST/CanonicalType.h"
+#include "llvm/Support/TextEncoding.h"
#include <optional>
namespace clang {
@@ -744,9 +745,9 @@ class FormatStringHandler {
// Printf-specific handlers.
virtual bool HandleInvalidPrintfConversionSpecifier(
- const analyze_printf::PrintfSpecifier &FS,
- const char *startSpecifier,
- unsigned specifierLen) {
+ const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier,
+ unsigned specifierLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
return true;
}
@@ -763,9 +764,9 @@ class FormatStringHandler {
// Scanf-specific handlers.
virtual bool HandleInvalidScanfConversionSpecifier(
- const analyze_scanf::ScanfSpecifier &FS,
- const char *startSpecifier,
- unsigned specifierLen) {
+ const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier,
+ unsigned specifierLen,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
return true;
}
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 1c16f9f79ae68..b3d507e1170dc 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -38,6 +38,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/TextEncoding.h"
#include "llvm/Support/VersionTuple.h"
#include "llvm/TargetParser/Triple.h"
#include <cassert>
@@ -320,6 +321,8 @@ class TargetInfo : public TransferrableTargetInfo,
virtual ~TargetInfo();
+ llvm::TextEncodingConverter *FormatStrConverter;
+
/// Retrieve the target options.
TargetOptions &getTargetOpts() const {
assert(TargetOpts && "Missing target options");
diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h
index 6a66d2d0ff707..ba6fb6c87a782 100644
--- a/clang/include/clang/Lex/LiteralConverter.h
+++ b/clang/include/clang/Lex/LiteralConverter.h
@@ -34,7 +34,7 @@ class LiteralConverter {
static std::error_code
setConvertersFromOptions(LiteralConverter &LiteralConv,
const clang::LangOptions &Opts,
- const clang::TargetInfo &TInfo);
+ clang::TargetInfo &TInfo);
};
#endif
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 58eb1c0a7c114..97867183b5a1d 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -5633,6 +5633,7 @@ class Parser : public CodeCompletionHandler {
bool Finished;
};
ObjCImplParsingDataRAII *CurParsedObjCImpl;
+ ConversionAction ParserConversionAction;
/// StashAwayMethodOrFunctionBodyTokens - Consume the tokens and store them
/// for later parsing.
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index cbfcc9bc0ea99..65567e367dea4 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -54,6 +54,7 @@
#include "clang/Basic/TemplateKinds.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Basic/TypeTraits.h"
+#include "clang/Lex/LiteralConverter.h"
#include "clang/Sema/AnalysisBasedWarnings.h"
#include "clang/Sema/Attr.h"
#include "clang/Sema/CleanupInfo.h"
@@ -7272,9 +7273,12 @@ class Sema final : public SemaBase {
/// from multiple tokens. However, the common case is that StringToks points
/// to one string.
ExprResult ActOnStringLiteral(ArrayRef<Token> StringToks,
- Scope *UDLScope = nullptr);
+ Scope *UDLScope = nullptr,
+ ConversionAction Action = CA_ToExecEncoding);
- ExprResult ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks);
+ ExprResult
+ ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks,
+ ConversionAction Action = CA_ToExecEncoding);
/// ControllingExprOrType is either an opaque pointer coming out of a
/// ParsedType or an Expr *. FIXME: it'd be better to split this interface
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 1d914fa876759..d9765f4a73fcd 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -667,6 +667,21 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedIdentKind IK) {
llvm_unreachable("Unknown ident kind for PredefinedExpr");
}
+std::string PredefinedExpr::ComputeNameAndTranslate(
+ PredefinedIdentKind IK, const Decl *CurrentDecl,
+ LiteralConverter &LiteralConv, bool ForceElaboratedPrinting) {
+ using namespace clang::charinfo;
+ std::string Result = ComputeName(IK, CurrentDecl, ForceElaboratedPrinting);
+ llvm::TextEncodingConverter *Converter =
+ LiteralConv.getConverter(CA_ToExecEncoding);
+ if (Converter) {
+ SmallString<128> Converted;
+ Converter->convert(Result, Converted);
+ Result = std::string(Converted);
+ }
+ return Result;
+}
+
// FIXME: Maybe this should use DeclPrinter with a special "print predefined
// expr" policy instead.
std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp
index d4cb89b43ae87..be0f527da92e5 100644
--- a/clang/lib/AST/FormatString.cpp
+++ b/clang/lib/AST/FormatString.cpp
@@ -33,8 +33,9 @@ FormatStringHandler::~FormatStringHandler() {}
// scanf format strings.
//===----------------------------------------------------------------------===//
-OptionalAmount
-clang::analyze_format_string::ParseAmount(const char *&Beg, const char *E) {
+OptionalAmount clang::analyze_format_string::ParseAmount(
+ const char *&Beg, const char *E,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
const char *I = Beg;
UpdateOnReturn <const char*> UpdateBeg(Beg, I);
@@ -42,7 +43,7 @@ clang::analyze_format_string::ParseAmount(const char *&Beg, const char *E) {
bool hasDigits = false;
for ( ; I != E; ++I) {
- char c = *I;
+ char c = FormatStrConverter.convert(*I);
if (c >= '0' && c <= '9') {
hasDigits = true;
accumulator = (accumulator * 10) + (c - '0');
@@ -59,27 +60,23 @@ clang::analyze_format_string::ParseAmount(const char *&Beg, const char *E) {
return OptionalAmount();
}
-OptionalAmount
-clang::analyze_format_string::ParseNonPositionAmount(const char *&Beg,
- const char *E,
- unsigned &argIndex) {
- if (*Beg == '*') {
+OptionalAmount clang::analyze_format_string::ParseNonPositionAmount(
+ const char *&Beg, const char *E, unsigned &argIndex,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
+ if (FormatStrConverter.convert(*Beg) == '*') {
++Beg;
return OptionalAmount(OptionalAmount::Arg, argIndex++, Beg, 0, false);
}
- return ParseAmount(Beg, E);
+ return ParseAmount(Beg, E, FormatStrConverter);
}
-OptionalAmount
-clang::analyze_format_string::ParsePositionAmount(FormatStringHandler &H,
- const char *Start,
- const char *&Beg,
- const char *E,
- PositionContext p) {
- if (*Beg == '*') {
+OptionalAmount clang::analyze_format_string::ParsePositionAmount(
+ FormatStringHandler &H, const char *Start, const char *&Beg, const char *E,
+ PositionContext p, const llvm::TextEncodingConverter &FormatStrConverter) {
+ if (FormatStrConverter.convert(*Beg) == '*') {
const char *I = Beg + 1;
- const OptionalAmount &Amt = ParseAmount(I, E);
+ const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter);
if (Amt.getHowSpecified() == OptionalAmount::NotSpecified) {
H.HandleInvalidPosition(Beg, I - Beg, p);
@@ -94,7 +91,7 @@ clang::analyze_format_string::ParsePositionAmount(FormatStringHandler &H,
assert(Amt.getHowSpecified() == OptionalAmount::Constant);
- if (*I == '$') {
+ if (FormatStrConverter.convert(*I) == '$') {
// Handle positional arguments
// Special case: '*0$', since this is an easy mistake.
@@ -114,24 +111,22 @@ clang::analyze_format_string::ParsePositionAmount(FormatStringHandler &H,
return OptionalAmount(false);
}
- return ParseAmount(Beg, E);
+ return ParseAmount(Beg, E, FormatStrConverter);
}
-
-bool
-clang::analyze_format_string::ParseFieldWidth(FormatStringHandler &H,
- FormatSpecifier &CS,
- const char *Start,
- const char *&Beg, const char *E,
- unsigned *argIndex) {
+bool clang::analyze_format_string::ParseFieldWidth(
+ FormatStringHandler &H, FormatSpecifier &CS, const char *Start,
+ const char *&Beg, const char *E, unsigned *argIndex,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
// FIXME: Support negative field widths.
if (argIndex) {
- CS.setFieldWidth(ParseNonPositionAmount(Beg, E, *argIndex));
+ CS.setFieldWidth(
+ ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter));
}
else {
- const OptionalAmount Amt =
- ParsePositionAmount(H, Start, Beg, E,
- analyze_format_string::FieldWidthPos);
+ const OptionalAmount Amt = ParsePositionAmount(
+ H, Start, Beg, E, analyze_format_string::FieldWidthPos,
+ FormatStrConverter);
if (Amt.isInvalid())
return true;
@@ -140,15 +135,13 @@ clang::analyze_format_string::ParseFieldWidth(FormatStringHandler &H,
return false;
}
-bool
-clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H,
- FormatSpecifier &FS,
- const char *Start,
- const char *&Beg,
- const char *E) {
+bool clang::analyze_format_string::ParseArgPosition(
+ FormatStringHandler &H, FormatSpecifier &FS, const char *Start,
+ const char *&Beg, const char *E,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
const char *I = Beg;
- const OptionalAmount &Amt = ParseAmount(I, E);
+ const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter);
if (I == E) {
// No more characters left?
@@ -156,7 +149,8 @@ clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H,
return true;
}
- if (Amt.getHowSpecified() == OptionalAmount::Constant && *(I++) == '$') {
+ if (Amt.getHowSpecified() == OptionalAmount::Constant &&
+ FormatStrConverter.convert(*(I++)) == '$') {
// Warn that positional arguments are non-standard.
H.HandlePosition(Start, I - Start);
@@ -177,17 +171,15 @@ clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H,
return false;
}
-bool
-clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H,
- FormatSpecifier &FS,
- const char *&I,
- const char *E,
- const LangOptions &LO) {
+bool clang::analyze_format_string::ParseVectorModifier(
+ FormatStringHandler &H, FormatSpecifier &FS, const char *&I, const char *E,
+ const LangOptions &LO,
+ const llvm::TextEncodingConverter &FormatStrConverter) {
if (!LO.OpenCL)
return false;
const char *Start = I;
- if (*I == 'v') {
+ if (FormatStrConverter.convert(*I) == 'v') {
++I;
if (I == E) {
@@ -195,7 +187,7 @@ clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H,
return true;
}
- OptionalAmount NumElts = ParseAmount(I, E);
+ OptionalAmount NumElts = ParseAmount(I, E, FormatStrConverter);
if (NumElts.getHowSpecified() != OptionalAmount::Constant) {
H.HandleIncompleteSpecifier(Start, E - Start);
return true;
@@ -207,86 +199,104 @@ clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H,
return false;
}
-bool
-clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS,
- const char *&I,
- const char *E,
- const LangOptions &LO,
- bool IsScanf) {
+bool clang::analyze_format_string::ParseLengthModifier(
+ FormatSpecifier &FS, const char *&I, const char *E, const LangOptions &LO,
+ const llvm::TextEncodingConverter &FormatStrConverter, bool IsScanf) {
LengthModifier::Kind lmKind = LengthModifier::None;
const char *lmPosition = I;
- switch (*I) {
- default:
- return false;
- case 'h':
+ switch (FormatStrConverter.convert(*I)) {
+ default:
+ return false;
+ case 'h':
+ ++I;
+ if (I != E && FormatStrConverter.convert(*I) == 'h') {
++I;
- if (I != E && *I == 'h') {
- ++I;
- lmKind = LengthModifier::AsChar;
- } else if (I != E && *I == 'l' && LO.OpenCL) {
- ++I;
- lmKind = LengthModifier::AsShortLong;
- } else {
- lmKind = LengthModifier::AsShort;
- }
- break;
- case 'l':
+ lmKind = LengthModifier::AsChar;
+ } else if (I != E && FormatStrConverter.convert(*I) == 'l' && LO.OpenCL) {
+ ++I;
+ lmKind = LengthModifier::AsShortLong;
+ } else {
+ lmKind = LengthModifier::AsShort;
+ }
+ break;
+ case 'l':
+ ++I;
+ if (I != E && FormatStrConverter.convert(*I) == 'l') {
+ ++I;
+ lmKind = LengthModifier::AsLongLong;
+ } else {
+ lmKind = LengthModifier::AsLong;
+ }
+ break;
+ case 'j':
+ lmKind = LengthModifier::AsIntMax;
+ ++I;
+ break;
+ case 'z':
+ lmKind = LengthModifier::AsSizeT;
+ ++I;
+ break;
+ case 't':
+ lmKind = LengthModifier::AsPtrDiff;
+ ++I;
+ break;
+ case 'L':
+ lmKind = LengthModifier::AsLongDouble;
+ ++I;
+ break;
+ case 'q':
+ lmKind = LengthModifier::AsQuad;
+ ++I;
+ break;
+ case 'a':
+ if (IsScanf && !LO.C99 && !LO.CPlusPlus11) {
+ // For scanf in C90, look at the next character to see if this should
+ // be parsed as the GNU extension 'a' length modifier. If not, this
+ // will be parsed as a conversion specifier.
++I;
- if (I != E && *I == 'l') {
- ++I;
- lmKind = LengthModifier::AsLongLong;
- } else {
- lmKind = LengthModifier::AsLong;
+ if (I != E && (FormatStrConverter.convert(*I) == 's' ||
+ FormatStrConverter.convert(*I) == 'S' ||
+ FormatStrConverter.convert(*I) == '[')) {
+ lmKind = LengthModifier::AsAllocate;
+ break;
}
+ --I;
+ }
+ return false;
+ case 'm':
+ if (IsScanf) {
+ lmKind = LengthModifier::AsMAllocate;
+ ++I;
break;
- case 'j': lmKind = LengthModifier::AsIntMax; ++I; break;
- case 'z': lmKind = LengthModifier::AsSizeT; ++I; break;
- case 't': lmKind = LengthModifier::AsPtrDiff; ++I; break;
- case 'L': lmKind = LengthModifier::AsLongDouble; ++I; break;
- case 'q': lmKind = LengthModifier::AsQuad; ++I; break;
- case 'a':
- if (IsScanf && !LO.C99 && !LO.CPlusPlus11) {
- // For scanf in C90, look at the next character to see if this should
- // be parsed as the GNU extension 'a' length modifier. If not, this
- // will be parsed as a conversion specifier.
- ++I;
- if (I != E && (*I == 's' || *I == 'S' || *I == '[')) {
- lmKind = LengthModifier::AsAllocate;
- break;
- }
- --I;
- }
- return false;
- case 'm':
- if (IsScanf) {
- lmKind = LengthModifier::AsMAllocate;
- ++I;
+ }
+ return false;
+ // printf: AsInt64, AsInt32, AsInt3264
+ // scanf: AsInt64
+ case 'I':
+ if (I + 1 != E && I + 2 != E) {
+ if (FormatStrConverter.convert(I[1]) == '6' &&
+ FormatStrConverter.convert(I[2]) == '4') {
+ I += 3;
+ lmKind = LengthModifier::AsInt64;
break;
}
- return false;
- // printf: AsInt64, AsInt32, AsInt3264
- // scanf: AsInt64
- case 'I':
- if (I + 1 != E && I + 2 != E) {
- if (I[1] == '6' && I[2] == '4') {
- I += 3;
- lmKind = LengthModifier::AsInt64;
- break;
- }
- if (IsScanf)
- return false;
+ if (IsScanf)
+ return false;
- if (I[1] == '3' && I[2] == '2') {
- I += 3;
- lmKind = LengthModifier::AsInt32;
- break;
- }
+ if (FormatStrConverter.convert(I[1]) == '3' &&
+ FormatStrConverter.convert(I[2]) == '2') {
+ I += 3;
+ lmKind = LengthModifier::AsInt32;
+ break;
}
- ++I;
- lmKind = LengthModifier::AsInt3264;
- break;
- case 'w':
- lmKind = LengthModifier::AsWide; ++I; break;
+ }
+ ++I;
+ lmKind = LengthModifier::AsInt3264;
+ break;
+ case 'w':
+ lmKind = LengthModifier::AsWide;
+ ++I;
+ break;
}
LengthModifier lm(lmPosition, lmKind);
FS.setLengthModifier(lm);
diff --git a/clang/lib/AST/FormatStringParsing.h b/clang/lib/AST/FormatStringParsing.h
index 764e5d46394d7..7ad6d4b98d2ac 100644
--- a/clang/lib/AST/FormatStringParsing.h
+++ b/clang/lib/AST/FormatStringParsing.h
@@ -37,31 +37,38 @@ class UpdateOnReturn {
namespace analyze_format_string {
-OptionalAmount ParseAmount(const char *&Beg, const char *E);
-OptionalAmount ParseNonPositionAmount(const char *&Beg, const char *E,
- unsigned &argIndex);
+OptionalAmount
+ParseAmount(const char *&Beg, const char *E,
+ const llvm::TextEncodingConverter &FormatStrConverter);
-OptionalAmount ParsePositionAmount(FormatStringHandler &H,
- const char *Start, const char *&Beg,
- const char *E, PositionContext p);
+OptionalAmount
+ParseNonPositionAmount(const char *&Beg, const char *E, unsigned &argIndex,
+ const llvm::TextEncodingConverter &FormatStrConverter);
-bool ParseFieldWidth(FormatStringHandler &H,
- FormatSpecifier &CS,
+OptionalAmount
+ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg,
+ const char *E, PositionContext p,
+ const llvm::TextEncodingConverter &FormatStrConverter);
+
+bool ParseFieldWidth(FormatStringHandler &H, FormatSpecifier &CS,
const char *Start, const char *&Beg, const char *E,
- unsigned *argIndex);
+ ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/169803
More information about the llvm-branch-commits
mailing list