[clang] [libcxx] [llvm] [Clang] Add warnings when mixing different charN_t types (PR #138708)
via llvm-commits
llvm-commits at lists.llvm.org
Sun May 11 01:22:18 PDT 2025
https://github.com/cor3ntin updated https://github.com/llvm/llvm-project/pull/138708
>From c23bf23ddc8e1c8f50a57fcaf74682e86d8ade16 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Tue, 6 May 2025 17:14:35 +0200
Subject: [PATCH 1/8] [Clang] Add warnings when mixing different charN_t types
The charN_t types represent code units of different UTF encodings.
Therefore, the values of two objects of different charN_t types do not
represent the same characters.
In order to avoid comparing apples and oranges, we add new warnings to
warn on:
- Implicit conversions
- Comparisons
- Other cases involving arithmetic conversions
We only produce the warning if we cannot establish the comparison
would be safe through constant evaluation.
The new `-Wimplicit-unicode-conversion` warning is enabled by default.
Note that this PR intentionally doesn't touch char/wchar_t,
but it would be worth considering extending the new
warnings to these types (in a follow-up).
Additionally, most arithmetic operations on charN_t
don't really make sense (i.e., what does it mean to add code units?),
so we could add warnings for that.
Fixes #138526
---
clang/docs/ReleaseNotes.rst | 4 +
clang/include/clang/AST/ASTDiagnostic.h | 3 +
clang/include/clang/AST/Type.h | 1 +
clang/include/clang/Basic/DiagnosticGroups.td | 1 +
.../clang/Basic/DiagnosticSemaKinds.td | 25 +++
clang/lib/AST/ASTDiagnostic.cpp | 29 ++++
clang/lib/AST/Type.cpp | 14 ++
clang/lib/Sema/SemaChecking.cpp | 48 ++++++
clang/lib/Sema/SemaExpr.cpp | 71 ++++++++
.../warn-implicit-unicode-conversions.cpp | 155 ++++++++++++++++++
llvm/include/llvm/Support/ConvertUTF.h | 4 +
llvm/lib/Support/ConvertUTFWrapper.cpp | 10 ++
12 files changed, 365 insertions(+)
create mode 100644 clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 203958dab7430..3a42f43d79fd1 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -503,6 +503,10 @@ Improvements to Clang's diagnostics
- ``-Wreserved-identifier`` now fires on reserved parameter names in a function
declaration which is not a definition.
+- A new ``-Wimplicit-unicode-conversion`` warning fires when comparing or implicitly converting
+ between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``).
+ This warning only triggers in C++ as these types are aliases in C. (#GH138526)
+
Improvements to Clang's time-trace
----------------------------------
diff --git a/clang/include/clang/AST/ASTDiagnostic.h b/clang/include/clang/AST/ASTDiagnostic.h
index ef22249828629..baa410e3e4a03 100644
--- a/clang/include/clang/AST/ASTDiagnostic.h
+++ b/clang/include/clang/AST/ASTDiagnostic.h
@@ -38,6 +38,9 @@ namespace clang {
/// is initialized before passing it in.
QualType desugarForDiagnostic(ASTContext &Context, QualType QT,
bool &ShouldAKA);
+
+ std::string FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T);
+
} // end namespace clang
#endif
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 02a6fb5333538..7fca11fb708cf 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2524,6 +2524,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
bool isChar16Type() const;
bool isChar32Type() const;
bool isAnyCharacterType() const;
+ bool isUnicodeCharacterType() const;
bool isIntegralType(const ASTContext &Ctx) const;
/// Determine whether this type is an integral or enumeration type.
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 1faf8508121f4..e5b5dbbd07f10 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -111,6 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
ImplicitEnumEnumCast,
EnumFloatConversion,
EnumCompareConditional]>;
+def ImplicitUnicodeConversion : DiagGroup<"implicit-unicode-conversion">;
def DeprecatedOFast : DiagGroup<"deprecated-ofast">;
def ObjCSignedCharBoolImplicitIntConversion :
DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index e5a7cdc14a737..a018f6693cff2 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4357,6 +4357,26 @@ def warn_address_of_reference_bool_conversion : Warning<
"code; pointer may be assumed to always convert to true">,
InGroup<UndefinedBoolConversion>;
+def warn_impcast_unicode_char_type : Warning<
+ "implicit conversion from %0 to %1 may change the meaning of the represented code unit">,
+ InGroup<ImplicitUnicodeConversion>;
+def warn_impcast_unicode_precision : Warning<
+ "implicit conversion from %0 to %1 may lose precision and change the meaning of the represented code unit">,
+ InGroup<ImplicitUnicodeConversion>;
+def warn_impcast_unicode_char_type_constant
+ : Warning<"implicit conversion from %0 to %1 changes the meaning of the "
+ "%select{code unit|codepoint}2 '%3'">,
+ InGroup<ImplicitUnicodeConversion>;
+
+def warn_comparison_unicode_mixed_types : Warning<
+ "comparing values of different Unicode code unit types %0 and %1 may compare different codepoints">,
+ InGroup<ImplicitUnicodeConversion>;
+
+def warn_comparison_unicode_mixed_types_constant
+ : Warning<"comparing values of different Unicode code unit types %0 and %1 "
+ "compares unrelated code units '%2' and '%3'">,
+ InGroup<ImplicitUnicodeConversion>;
+
def warn_xor_used_as_pow : Warning<
"result of '%0' is %1; did you mean exponentiation?">,
InGroup<XorUsedAsPow>;
@@ -7719,6 +7739,11 @@ def warn_comparison_of_mixed_enum_types_switch : Warning<
"%diff{ ($ and $)|}0,1">,
InGroup<EnumCompareSwitch>;
+def warn_arith_conv_mixed__unicode_types
+ : Warning<"%sub{select_arith_conv_kind}0 "
+ "different Unicode character types %1 and %2">,
+ InGroup<ImplicitUnicodeConversion>;
+
def err_typecheck_assign_const : Error<
"%select{"
"cannot assign to return value because function %1 returns a const value|"
diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp
index 6cb09b0492ac9..0c9f50fb1a01c 100644
--- a/clang/lib/AST/ASTDiagnostic.cpp
+++ b/clang/lib/AST/ASTDiagnostic.cpp
@@ -20,6 +20,8 @@
#include "clang/AST/TemplateBase.h"
#include "clang/AST/Type.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
using namespace clang;
@@ -2190,3 +2192,30 @@ static bool FormatTemplateTypeDiff(ASTContext &Context, QualType FromType,
TD.DiffTemplate();
return TD.Emit();
}
+
+std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) {
+ auto IsSingleCodeUnitCP = [](unsigned Value, QualType T) {
+ if (T->isChar8Type()) {
+ assert(Value <= 0xFF && "not a valid UTF-8 code unit");
+ return Value <= 0x7F;
+ }
+ if (T->isChar16Type()) {
+ assert(Value <= 0xFFFF && "not a valid UTF-16 code unit");
+ return llvm::IsSingleCodeUnitUTF16Codepoint(Value);
+ }
+ return llvm::IsSingleCodeUnitUTF32Codepoint(Value);
+ };
+ llvm::SmallVector<char, 4> Str;
+ if (!IsSingleCodeUnitCP(Value, T)) {
+ llvm::raw_svector_ostream OS(Str);
+ OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">";
+ return std::string(Str.begin(), Str.end());
+ }
+
+ char Buffer[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
+ char *Ptr = Buffer;
+ [[maybe_unused]] bool Converted = llvm::ConvertCodePointToUTF8(Value, Ptr);
+ assert(Converted && "trying to encode invalid code unit");
+ EscapeStringForDiagnostic(StringRef(Buffer, Ptr - Buffer), Str);
+ return std::string(Str.begin(), Str.end());
+}
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index fbd09141bc541..2da63b13faf9d 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2193,6 +2193,20 @@ bool Type::isAnyCharacterType() const {
}
}
+bool Type::isUnicodeCharacterType() const {
+ const auto *BT = dyn_cast<BuiltinType>(CanonicalType);
+ if (!BT)
+ return false;
+ switch (BT->getKind()) {
+ default:
+ return false;
+ case BuiltinType::Char8:
+ case BuiltinType::Char16:
+ case BuiltinType::Char32:
+ return true;
+ }
+}
+
/// isSignedIntegerType - Return true if this is an integer type that is
/// signed, according to C99 6.2.5p4 [char, signed char, short, int, long..],
/// an enum decl which has a signed representation
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 97f623f61a405..d12b5cea37aa6 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -14,6 +14,7 @@
#include "CheckExprLifetime.h"
#include "clang/AST/APValue.h"
#include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTDiagnostic.h"
#include "clang/AST/Attr.h"
#include "clang/AST/AttrIterator.h"
#include "clang/AST/CharUnits.h"
@@ -11810,6 +11811,46 @@ static void DiagnoseIntInBoolContext(Sema &S, Expr *E) {
}
}
+static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
+ const Type *Target, Expr *E,
+ QualType T,
+ SourceLocation CC) {
+ assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() &&
+ Source != Target);
+ Expr::EvalResult Result;
+ if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects,
+ S.isConstantEvaluatedContext())) {
+ llvm::APSInt Value(32);
+ Value = Result.Val.getInt();
+ bool IsASCII = Value <= 0x7F;
+ bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF);
+ bool ConversionPreservesSemantics =
+ IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP);
+
+ if (!ConversionPreservesSemantics) {
+ auto IsSingleCodeUnitCP = [](const QualType &T,
+ const llvm::APSInt &Value) {
+ if (T->isChar8Type())
+ return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
+ if (T->isChar16Type())
+ return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
+ return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
+ };
+
+ S.Diag(CC, diag::warn_impcast_unicode_char_type_constant)
+ << E->getType() << T
+ << IsSingleCodeUnitCP(E->getType().getUnqualifiedType(), Value)
+ << FormatUTFCodeUnitAsCodepoint(Value.getExtValue(), E->getType());
+ }
+ } else {
+ bool LosesPrecision = S.getASTContext().getIntWidth(E->getType()) >
+ S.getASTContext().getIntWidth(T);
+ DiagnoseImpCast(S, E, T, CC,
+ LosesPrecision ? diag::warn_impcast_unicode_precision
+ : diag::warn_impcast_unicode_char_type);
+ }
+}
+
void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
bool *ICContext, bool IsListInit) {
if (E->isTypeDependent() || E->isValueDependent()) return;
@@ -12147,6 +12188,13 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
DiscardMisalignedMemberAddress(Target, E);
+
+ if(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) {
+ DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC);
+ return;
+ }
+
+
if (Target->isBooleanType())
DiagnoseIntInBoolContext(*this, E);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index be3f145f3c5f1..b0080b778db61 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -15,6 +15,7 @@
#include "UsedDeclVisitor.h"
#include "clang/AST/ASTConsumer.h"
#include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTDiagnostic.h"
#include "clang/AST/ASTLambda.h"
#include "clang/AST/ASTMutationListener.h"
#include "clang/AST/CXXInheritance.h"
@@ -1567,6 +1568,72 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS,
}
}
+static void CheckUnicodeArithmeticConversions(Sema & SemaRef,
+ Expr *LHS,
+ Expr *RHS,
+ SourceLocation Loc,
+ ArithConvKind ACK) {
+ QualType LHSType = LHS->getType().getUnqualifiedType();
+ QualType RHSType = RHS->getType().getUnqualifiedType();
+
+ if(!SemaRef.getLangOpts().CPlusPlus ||
+ !LHSType->isUnicodeCharacterType() || !RHSType->isUnicodeCharacterType())
+ return;
+
+ if(ACK == ArithConvKind::Comparison) {
+ if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
+ return;
+
+ Expr::EvalResult LHSRes, RHSRes;
+ bool Success = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(),
+ Expr::SE_AllowSideEffects,
+ SemaRef.isConstantEvaluatedContext());
+ if (Success)
+ Success = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(),
+ Expr::SE_AllowSideEffects,
+ SemaRef.isConstantEvaluatedContext());
+ if (Success) {
+ llvm::APSInt LHSValue(32);
+ LHSValue = LHSRes.Val.getInt();
+ llvm::APSInt RHSValue(32);
+ RHSValue = RHSRes.Val.getInt();
+
+ auto IsSingleCodeUnitCP = [](const QualType &T,
+ const llvm::APSInt &Value) {
+ if (T->isChar8Type())
+ return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
+ if (T->isChar16Type())
+ return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
+ return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
+ };
+
+ bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue);
+ bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue);
+ if (LHSSafe && RHSSafe)
+ return;
+
+ SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
+ << LHS->getSourceRange() << RHS->getSourceRange() << LHSType
+ << RHSType
+ << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType)
+ << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType);
+ return;
+ }
+ SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
+ << LHS->getSourceRange() << RHS->getSourceRange()
+ << LHSType << RHSType;
+ return;
+ }
+
+ if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
+ return;
+
+ SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types)
+ << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType
+ << RHSType;
+ return;
+}
+
/// UsualArithmeticConversions - Performs various conversions that are common to
/// binary operators (C99 6.3.1.8). If both operands aren't arithmetic, this
/// routine returns the first non-arithmetic type found. The client is
@@ -1574,8 +1641,12 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS,
QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS,
SourceLocation Loc,
ArithConvKind ACK) {
+
checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK);
+ CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(),
+ Loc, ACK);
+
if (ACK != ArithConvKind::CompAssign) {
LHS = UsualUnaryConversions(LHS.get());
if (LHS.isInvalid())
diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
new file mode 100644
index 0000000000000..41794b15175b5
--- /dev/null
+++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
@@ -0,0 +1,155 @@
+// RUN: %clang_cc1 -verify -fsyntax-only -std=c++20 -Wconversion %s
+
+void c8(char8_t);
+void c16(char16_t);
+void c32(char32_t);
+
+void test(char8_t u8, char16_t u16, char32_t u32) {
+ c8(u8);
+ c8(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' may lose precision and change the meaning of the represented code unit}}
+ c8(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' may lose precision and change the meaning of the represented code unit}}
+
+ c16(u8); // expected-warning {{implicit conversion from 'char8_t' to 'char16_t' may change the meaning of the represented code unit}}
+ c16(u16);
+ c16(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' may lose precision and change the meaning of the represented code unit}}
+
+ c32(u8); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' may change the meaning of the represented code unit}}
+ c32(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' may change the meaning of the represented code unit}}
+ c32(u32);
+
+
+ c8(char32_t(0x7f));
+ c8(char32_t(0x80)); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' changes the meaning of the codepoint '<U+0080>'}}
+
+ c8(char16_t(0x7f));
+ c8(char16_t(0x80)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the codepoint '<U+0080>'}}
+ c8(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code unit '<0xD800>'}}
+ c8(char16_t(0xE000)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the codepoint '<U+E000>'}}
+
+
+ c16(char32_t(0x7f));
+ c16(char32_t(0x80));
+ c16(char32_t(0xD7FF));
+ c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}}
+ c16(char32_t(0xE000));
+ c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the codepoint '🐉'}}
+
+
+ c32(char8_t(0x7f));
+ c32(char8_t(0x80)); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' changes the meaning of the code unit '<0x80>'}}
+ c32(char8_t(0xFF)); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' changes the meaning of the code unit '<0xFF>'}}
+
+
+ c32(char16_t(0x7f));
+ c32(char16_t(0x80));
+
+ c32(char16_t(0xD7FF));
+ c32(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xD800>'}}
+ c32(char16_t(0xDFFF)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xDFFF>'}}
+ c32(char16_t(0xE000));
+ c32(char16_t(u'☕'));
+
+ (void)static_cast<char32_t>(char8_t(0x80)); // sanity check: no explicit conversion;
+
+ using Char8 = char8_t;
+ Char8 c81 = u16; // expected-warning {{implicit conversion from 'char16_t' to 'Char8' (aka 'char8_t') may lose precision and change the meaning of the represented code unit}}
+
+ [[maybe_unused]] char c = u16; // expected-warning {{implicit conversion loses integer precision: 'char16_t' to 'char'}}
+
+ // FIXME: We should apply the same logic to wchar
+ [[maybe_unused]] wchar_t wc = u16;
+ [[maybe_unused]] wchar_t wc2 = u8;
+}
+
+void test_comp(char8_t u8, char16_t u16, char32_t u32) {
+ (void)(u8 == u8' ');
+ (void)(u8 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' may compare different codepoints}}
+ (void)(u8 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different codepoints}}
+
+ (void)(u16 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different codepoints}}
+ (void)(u16 == u' ');
+ (void)(u16 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different codepoints}}
+
+ (void)(u32 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different codepoints}}
+ (void)(u32 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different codepoints}}
+ (void)(u32 == U' ');
+
+
+ (void)(u8' ' == u' ');
+ (void)(u8' ' == u' ');
+
+
+ (void)(u8 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different codepoints}}
+ (void)(u16 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different codepoints}}
+ (void)(u16 == u' ');
+ (void)(u16 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different codepoints}}
+
+ (void)(u32 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different codepoints}}
+ (void)(u32 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different codepoints}}
+ (void)(u32 == U' ');
+
+
+ (void)(char8_t(0x7f) == char8_t(0x7f));
+ (void)(char8_t(0x7f) == char16_t(0x7f));
+ (void)(char8_t(0x7f) == char32_t(0x7f));
+
+ (void)(char8_t(0x80) == char8_t(0x80));
+ (void)(char8_t(0x80) == char16_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' compares unrelated code units '<0x80>' and '<U+0080>}}
+ (void)(char8_t(0x80) == char32_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' compares unrelated code units '<0x80>' and '<U+0080>}}
+
+ (void)(char8_t(0x80) == char8_t(0x7f));
+ (void)(char8_t(0x80) == char16_t(0x7f)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' compares unrelated code units '<0x80>' and '<U+007F>'}}
+ (void)(char8_t(0x80) == char32_t(0x7f)); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' compares unrelated code units '<0x80>' and '<U+007F>'}}
+
+
+ (void)(char16_t(0x7f) < char8_t(0x7f));
+ (void)(char16_t(0x7f) < char16_t(0x7f));
+ (void)(char16_t(0x7f) < char32_t(0x7f));
+
+ (void)(char16_t(0x80) < char8_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' compares unrelated code units '<U+0080>' and '<0x80>'}}
+ (void)(char16_t(0x80) < char16_t(0x80));
+ (void)(char16_t(0x80) < char32_t(0x80));
+
+ (void)(char16_t(0x80) == char8_t(0x7f));
+ (void)(char16_t(0x80) < char16_t(0x7f));
+ (void)(char16_t(0x80) < char32_t(0x7f));
+
+
+ (void)(char32_t(0x7f) < char8_t(0x7f));
+ (void)(char32_t(0x7f) < char16_t(0x7f));
+ (void)(char32_t(0x7f) < char32_t(0x7f));
+
+ (void)(char32_t(0x80) < char8_t(0x80)); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' compares unrelated code units '<U+0080>' and '<0x80>'}}
+ (void)(char32_t(0x80) < char16_t(0x80));
+ (void)(char32_t(0x80) < char32_t(0x80));
+
+ (void)(char32_t(0x80) == char8_t(0x7f));
+ (void)(char32_t(0x80) < char16_t(0x7f));
+ (void)(char32_t(0x80) < char32_t(0x7f));
+
+
+ (void)(char32_t(U'🐉') <= char16_t(0xD800)); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' compares unrelated code units '🐉' and '<0xD800>'}}
+ (void)(char32_t(U'🐉') <= char16_t(0xD7FF));
+
+ (void)(char16_t(0xD800) >= char32_t(U'🐉')); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' compares unrelated code units '<0xD800>' and '🐉'}}
+ (void)(char16_t(0xD7FF) >= char32_t(U'🐉'));
+}
+
+void check_arithmetic(char8_t u8, char16_t u16, char32_t u32) {
+
+ (void)(u8 + u8);
+ (void)(u16 += u16);
+ (void)(u32 & u32);
+ (void)(1 ? u16 : u16);
+
+ (void)(u8 + u16); // expected-warning {{arithmetic between different Unicode character types 'char8_t' and 'char16_t'}}
+ (void)(u8 += u16); // expected-warning {{compound assignment of different Unicode character types 'char8_t' and 'char16_t'}}
+ (void)(u8 & u16); // expected-warning {{bitwise operation between different Unicode character types 'char8_t' and 'char16_t'}}
+ (void)(1 ? u8 : u16); // expected-warning {{conditional expression between different Unicode character types 'char8_t' and 'char16_t'}}
+
+
+ (void)(u16 * u32); // expected-warning {{arithmetic between different Unicode character types 'char16_t' and 'char32_t'}}
+ (void)(u16 -= u32); // expected-warning {{compound assignment of different Unicode character types 'char16_t' and 'char32_t'}}
+ (void)(u16 | u32); // expected-warning {{bitwise operation between different Unicode character types 'char16_t' and 'char32_t'}}
+ (void)(1 ? u32 : u16); // expected-warning {{conditional expression between different Unicode character types 'char32_t' and 'char16_t'}}
+}
diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h
index 25d46178457d6..e30b3ee68364e 100644
--- a/llvm/include/llvm/Support/ConvertUTF.h
+++ b/llvm/include/llvm/Support/ConvertUTF.h
@@ -328,6 +328,10 @@ bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out);
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
SmallVectorImpl<UTF16> &DstUTF16);
+bool IsSingleCodeUnitUTF8Codepoint(unsigned);
+bool IsSingleCodeUnitUTF16Codepoint(unsigned);
+bool IsSingleCodeUnitUTF32Codepoint(unsigned);
+
#if defined(_WIN32)
namespace sys {
namespace windows {
diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp
index 4952fe65d7767..76ead00c977bd 100644
--- a/llvm/lib/Support/ConvertUTFWrapper.cpp
+++ b/llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -303,5 +303,15 @@ bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
}
}
+bool IsSingleCodeUnitUTF8Codepoint(unsigned V) { return V <= 0x7F; }
+
+bool IsSingleCodeUnitUTF16Codepoint(unsigned V) {
+ return V <= 0xD7FF || (V >= 0xE000 && V <= 0xFFFF);
+}
+
+bool IsSingleCodeUnitUTF32Codepoint(unsigned V) {
+ return V <= 0xD7FF || (V >= 0xE000 && V <= 0x10FFFF);
+}
+
} // end namespace llvm
>From 513b292ad18da9c33968f1cc22f71e9256cfebfc Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Tue, 6 May 2025 17:38:28 +0200
Subject: [PATCH 2/8] format
---
.../clang/Basic/DiagnosticSemaKinds.td | 21 ++--
clang/lib/Sema/SemaChecking.cpp | 4 +-
clang/lib/Sema/SemaExpr.cpp | 112 +++++++++---------
3 files changed, 67 insertions(+), 70 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a018f6693cff2..9cd5d3d36b928 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4357,20 +4357,23 @@ def warn_address_of_reference_bool_conversion : Warning<
"code; pointer may be assumed to always convert to true">,
InGroup<UndefinedBoolConversion>;
-def warn_impcast_unicode_char_type : Warning<
- "implicit conversion from %0 to %1 may change the meaning of the represented code unit">,
- InGroup<ImplicitUnicodeConversion>;
-def warn_impcast_unicode_precision : Warning<
- "implicit conversion from %0 to %1 may lose precision and change the meaning of the represented code unit">,
- InGroup<ImplicitUnicodeConversion>;
+def warn_impcast_unicode_char_type
+ : Warning<"implicit conversion from %0 to %1 may change the meaning of the "
+ "represented code unit">,
+ InGroup<ImplicitUnicodeConversion>;
+def warn_impcast_unicode_precision
+ : Warning<"implicit conversion from %0 to %1 may lose precision and change "
+ "the meaning of the represented code unit">,
+ InGroup<ImplicitUnicodeConversion>;
def warn_impcast_unicode_char_type_constant
: Warning<"implicit conversion from %0 to %1 changes the meaning of the "
"%select{code unit|codepoint}2 '%3'">,
InGroup<ImplicitUnicodeConversion>;
-def warn_comparison_unicode_mixed_types : Warning<
- "comparing values of different Unicode code unit types %0 and %1 may compare different codepoints">,
- InGroup<ImplicitUnicodeConversion>;
+def warn_comparison_unicode_mixed_types
+ : Warning<"comparing values of different Unicode code unit types %0 and %1 "
+ "may compare different codepoints">,
+ InGroup<ImplicitUnicodeConversion>;
def warn_comparison_unicode_mixed_types_constant
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index d12b5cea37aa6..9361683ff4a8c 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -12188,13 +12188,11 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
DiscardMisalignedMemberAddress(Target, E);
-
- if(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) {
+ if (Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) {
DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC);
return;
}
-
if (Target->isBooleanType())
DiagnoseIntInBoolContext(*this, E);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index b0080b778db61..a7a7f55f3d34f 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -1568,70 +1568,67 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS,
}
}
-static void CheckUnicodeArithmeticConversions(Sema & SemaRef,
- Expr *LHS,
- Expr *RHS,
- SourceLocation Loc,
- ArithConvKind ACK) {
- QualType LHSType = LHS->getType().getUnqualifiedType();
- QualType RHSType = RHS->getType().getUnqualifiedType();
-
- if(!SemaRef.getLangOpts().CPlusPlus ||
- !LHSType->isUnicodeCharacterType() || !RHSType->isUnicodeCharacterType())
- return;
+static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS,
+ Expr *RHS, SourceLocation Loc,
+ ArithConvKind ACK) {
+ QualType LHSType = LHS->getType().getUnqualifiedType();
+ QualType RHSType = RHS->getType().getUnqualifiedType();
+
+ if (!SemaRef.getLangOpts().CPlusPlus || !LHSType->isUnicodeCharacterType() ||
+ !RHSType->isUnicodeCharacterType())
+ return;
- if(ACK == ArithConvKind::Comparison) {
- if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
- return;
+ if (ACK == ArithConvKind::Comparison) {
+ if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
+ return;
- Expr::EvalResult LHSRes, RHSRes;
- bool Success = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(),
- Expr::SE_AllowSideEffects,
- SemaRef.isConstantEvaluatedContext());
- if (Success)
- Success = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(),
- Expr::SE_AllowSideEffects,
- SemaRef.isConstantEvaluatedContext());
- if (Success) {
- llvm::APSInt LHSValue(32);
- LHSValue = LHSRes.Val.getInt();
- llvm::APSInt RHSValue(32);
- RHSValue = RHSRes.Val.getInt();
-
- auto IsSingleCodeUnitCP = [](const QualType &T,
- const llvm::APSInt &Value) {
- if (T->isChar8Type())
- return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
- if (T->isChar16Type())
- return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
- return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
- };
-
- bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue);
- bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue);
- if (LHSSafe && RHSSafe)
- return;
+ Expr::EvalResult LHSRes, RHSRes;
+ bool Success = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(),
+ Expr::SE_AllowSideEffects,
+ SemaRef.isConstantEvaluatedContext());
+ if (Success)
+ Success = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(),
+ Expr::SE_AllowSideEffects,
+ SemaRef.isConstantEvaluatedContext());
+ if (Success) {
+ llvm::APSInt LHSValue(32);
+ LHSValue = LHSRes.Val.getInt();
+ llvm::APSInt RHSValue(32);
+ RHSValue = RHSRes.Val.getInt();
+
+ auto IsSingleCodeUnitCP = [](const QualType &T,
+ const llvm::APSInt &Value) {
+ if (T->isChar8Type())
+ return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
+ if (T->isChar16Type())
+ return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
+ return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
+ };
- SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
- << LHS->getSourceRange() << RHS->getSourceRange() << LHSType
- << RHSType
- << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType)
- << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType);
+ bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue);
+ bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue);
+ if (LHSSafe && RHSSafe)
return;
- }
- SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
- << LHS->getSourceRange() << RHS->getSourceRange()
- << LHSType << RHSType;
- return;
- }
- if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
+ SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
+ << LHS->getSourceRange() << RHS->getSourceRange() << LHSType
+ << RHSType
+ << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType)
+ << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType);
return;
+ }
+ SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
+ << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType;
+ return;
+ }
- SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types)
- << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType
- << RHSType;
+ if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
return;
+
+ SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types)
+ << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType
+ << RHSType;
+ return;
}
/// UsualArithmeticConversions - Performs various conversions that are common to
@@ -1644,8 +1641,7 @@ QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS,
checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK);
- CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(),
- Loc, ACK);
+ CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), Loc, ACK);
if (ACK != ArithConvKind::CompAssign) {
LHS = UsualUnaryConversions(LHS.get());
>From e31e747455a0191ae9ada002861e2cf3e8ab59f0 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Thu, 8 May 2025 15:42:10 +0200
Subject: [PATCH 3/8] Silence warnings in libc++
---
libcxx/include/print | 2 +-
.../alg.nonmodifying/alg.equal/equal.pass.cpp | 2 +-
.../alg.nonmodifying/alg.find/find.pass.cpp | 2 +-
.../test/std/localization/codecvt_unicode.pass.cpp | 12 ++++++------
.../char16_t_char8_t_in.pass.cpp | 2 +-
.../char16_t_char8_t_out.pass.cpp | 2 +-
.../char32_t_char8_t_in.pass.cpp | 2 +-
.../char32_t_char8_t_out.pass.cpp | 2 +-
.../assign2.pass.cpp | 4 ++--
9 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/libcxx/include/print b/libcxx/include/print
index 61c3ebcd98cb8..be05d30e0147f 100644
--- a/libcxx/include/print
+++ b/libcxx/include/print
@@ -123,7 +123,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value
_LIBCPP_ASSERT_UNCATEGORIZED(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-16");
if (__value < 0x10000) {
- *__out_it++ = __value;
+ *__out_it++ = static_cast<iter_value_t<_OutIt>>(__value);
return;
}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
index 02cc84c288828..b7266d675c2a1 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
@@ -19,7 +19,7 @@
// equal(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2);
// We test the cartesian product, so we sometimes compare differently signed types
-// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
+// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -Wno-implicit-unicode-conversion
// MSVC warning C4242: 'argument': conversion from 'int' to 'const _Ty', possible loss of data
// MSVC warning C4244: 'argument': conversion from 'wchar_t' to 'const _Ty', possible loss of data
// MSVC warning C4389: '==': signed/unsigned mismatch
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
index 3aaeb9c2f345f..cfc60369dd69f 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare
-// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
+// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -Wno-implicit-unicode-conversion
// MSVC warning C4245: conversion from 'int' to 'wchar_t', signed/unsigned mismatch
// MSVC warning C4305: truncation from 'int' to 'bool'
// MSVC warning C4310: cast truncates constant value
diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
index e54c0c2a4610a..7c5f112c7f495 100644
--- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp
+++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
@@ -484,7 +484,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
- const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {u'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
@@ -549,7 +549,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
- const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {u'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
@@ -618,7 +618,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
- const char16_t expected[] = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {u'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
@@ -765,7 +765,7 @@ void utf8_to_utf16_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
template <class InternT, class ExternT>
void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
- const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
static_assert(array_size(input) == 6, "");
static_assert(array_size(expected) == 11, "");
@@ -801,7 +801,7 @@ void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt)
template <class InternT, class ExternT>
void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
- const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
static_assert(array_size(input) == 6, "");
static_assert(array_size(expected) == 11, "");
@@ -860,7 +860,7 @@ void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>&
template <class InternT, class ExternT>
void utf16_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
- const char16_t input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT input[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
static_assert(array_size(input) == 6, "");
static_assert(array_size(expected) == 11, "");
diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp
index c34e864220e12..86a08ee32cb45 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp
@@ -33,6 +33,6 @@ int main(int, char**) {
assert(from_next - from == 9);
assert(to_next - to == 9);
for (unsigned i = 0; i < 9; ++i)
- assert(to[i] == from[i]);
+ assert(to[i] == static_cast<char16_t>(from[i]));
return 0;
}
diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp
index c39e64de7a59f..d5c0c3cf31244 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp
@@ -34,6 +34,6 @@ int main(int, char**) {
assert(from_next - from == 9);
assert(to_next - to == 9);
for (unsigned i = 0; i < 9; ++i)
- assert(to[i] == from[i]);
+ assert(static_cast<char16_t>(to[i]) == from[i]);
return 0;
}
diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp
index e848f8a10912e..e6af982c10e99 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp
@@ -33,6 +33,6 @@ int main(int, char**) {
assert(from_next - from == 9);
assert(to_next - to == 9);
for (unsigned i = 0; i < 9; ++i)
- assert(to[i] == from[i]);
+ assert(to[i] == static_cast<char32_t>(from[i]));
return 0;
}
diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp
index 7a31c9ef10558..61a0502022840 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp
@@ -34,6 +34,6 @@ int main(int, char**) {
assert(from_next - from == 9);
assert(to_next - to == 9);
for (unsigned i = 0; i < 9; ++i)
- assert(to[i] == from[i]);
+ assert(static_cast<char32_t>(to[i]) == static_cast<char32_t>(from[i]));
return 0;
}
diff --git a/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp b/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp
index e3bc9c3c100d4..971fcd68cc8e6 100644
--- a/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp
+++ b/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp
@@ -19,9 +19,9 @@
#ifndef TEST_HAS_NO_CHAR8_T
constexpr bool test_constexpr() {
- char8_t c = u'1';
+ char8_t c = u8'1';
std::char_traits<char8_t>::assign(c, u'a');
- return c == u'a';
+ return c == u8'a';
}
int main(int, char**) {
>From 2df2d4844ba1d6a21c8d4677a54858cf085ff886 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Thu, 8 May 2025 19:07:19 +0200
Subject: [PATCH 4/8] try to fix the libc++ build
---
.../std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp | 4 +++-
.../std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp | 3 ++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
index b7266d675c2a1..780d18b364770 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
@@ -19,7 +19,9 @@
// equal(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2);
// We test the cartesian product, so we sometimes compare differently signed types
-// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -Wno-implicit-unicode-conversion
+// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
+// ADDITIONAL_COMPILE_FLAGS(clang-21): -Wno-implicit-unicode-conversion
+
// MSVC warning C4242: 'argument': conversion from 'int' to 'const _Ty', possible loss of data
// MSVC warning C4244: 'argument': conversion from 'wchar_t' to 'const _Ty', possible loss of data
// MSVC warning C4389: '==': signed/unsigned mismatch
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
index cfc60369dd69f..1d31a43953d3b 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
@@ -7,7 +7,8 @@
//===----------------------------------------------------------------------===//
// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare
-// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare -Wno-implicit-unicode-conversion
+// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
+// ADDITIONAL_COMPILE_FLAGS(clang-21): -Wno-implicit-unicode-conversion
// MSVC warning C4245: conversion from 'int' to 'wchar_t', signed/unsigned mismatch
// MSVC warning C4305: truncation from 'int' to 'bool'
// MSVC warning C4310: cast truncates constant value
>From 5e092e78ac6de9df564ee393af7ef6031a95e3ad Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Sat, 10 May 2025 21:51:37 +0200
Subject: [PATCH 5/8] Use -Wcharacter-conversion, add it to -Wconversion, make
it a feature for libcxx
---
clang/docs/ReleaseNotes.rst | 2 +-
clang/include/clang/Basic/DiagnosticGroups.td | 3 ++-
clang/include/clang/Basic/DiagnosticSemaKinds.td | 14 +++++++-------
.../alg.nonmodifying/alg.equal/equal.pass.cpp | 2 +-
.../alg.nonmodifying/alg.find/find.pass.cpp | 2 +-
libcxx/utils/libcxx/test/features.py | 4 ++++
6 files changed, 16 insertions(+), 11 deletions(-)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3a42f43d79fd1..ab1ae3ddb48e1 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -503,7 +503,7 @@ Improvements to Clang's diagnostics
- ``-Wreserved-identifier`` now fires on reserved parameter names in a function
declaration which is not a definition.
-- A new ``-Wimplicit-unicode-conversion`` warns where comparing or implicitly converting
+- A new ``-Wcharacter-conversion`` warns where comparing or implicitly converting
between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``).
This warning only triggers in C++ as these types are aliases in C. (#GH138526)
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index e5b5dbbd07f10..5bea4f09432b0 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -111,7 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
ImplicitEnumEnumCast,
EnumFloatConversion,
EnumCompareConditional]>;
-def ImplicitUnicodeConversion : DiagGroup<"implicit-unicode-conversion">;
+def CharacterConversion : DiagGroup<"character-conversion">;
def DeprecatedOFast : DiagGroup<"deprecated-ofast">;
def ObjCSignedCharBoolImplicitIntConversion :
DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
@@ -1074,6 +1074,7 @@ def Parentheses : DiagGroup<"parentheses",
// - __null-to-integer conversion warnings are on by default
def Conversion : DiagGroup<"conversion",
[BoolConversion,
+ CharacterConversion,
ConstantConversion,
EnumConversion,
BitFieldEnumConversion,
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 9cd5d3d36b928..be2791c3ff1fc 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4360,25 +4360,25 @@ def warn_address_of_reference_bool_conversion : Warning<
def warn_impcast_unicode_char_type
: Warning<"implicit conversion from %0 to %1 may change the meaning of the "
"represented code unit">,
- InGroup<ImplicitUnicodeConversion>;
+ InGroup<CharacterConversion>;
def warn_impcast_unicode_precision
: Warning<"implicit conversion from %0 to %1 may lose precision and change "
"the meaning of the represented code unit">,
- InGroup<ImplicitUnicodeConversion>;
+ InGroup<CharacterConversion>;
def warn_impcast_unicode_char_type_constant
: Warning<"implicit conversion from %0 to %1 changes the meaning of the "
"%select{code unit|codepoint}2 '%3'">,
- InGroup<ImplicitUnicodeConversion>;
+ InGroup<CharacterConversion>;
def warn_comparison_unicode_mixed_types
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
"may compare different codepoints">,
- InGroup<ImplicitUnicodeConversion>;
+ InGroup<CharacterConversion>;
def warn_comparison_unicode_mixed_types_constant
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
"compares unrelated code units '%2' and '%3'">,
- InGroup<ImplicitUnicodeConversion>;
+ InGroup<CharacterConversion>;
def warn_xor_used_as_pow : Warning<
"result of '%0' is %1; did you mean exponentiation?">,
@@ -6843,7 +6843,7 @@ def err_counted_by_on_incomplete_type_on_use : Error <
def note_counted_by_consider_completing_pointee_ty : Note<
"consider providing a complete definition for %0">;
-
+
def note_counted_by_consider_using_sized_by : Note<
"consider using '__sized_by%select{|_or_null}0' instead of "
"'__counted_by%select{|_or_null}0'">;
@@ -7745,7 +7745,7 @@ def warn_comparison_of_mixed_enum_types_switch : Warning<
def warn_arith_conv_mixed__unicode_types
: Warning<"%sub{select_arith_conv_kind}0 "
"different Unicode character types %1 and %2">,
- InGroup<ImplicitUnicodeConversion>;
+ InGroup<CharacterConversion>;
def err_typecheck_assign_const : Error<
"%select{"
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
index 780d18b364770..859532d4b79c7 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
@@ -20,7 +20,7 @@
// We test the cartesian product, so we sometimes compare differently signed types
// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
-// ADDITIONAL_COMPILE_FLAGS(clang-21): -Wno-implicit-unicode-conversion
+// ADDITIONAL_COMPILE_FLAGS(character-conversion-warnings): -Wno-character-conversion
// MSVC warning C4242: 'argument': conversion from 'int' to 'const _Ty', possible loss of data
// MSVC warning C4244: 'argument': conversion from 'wchar_t' to 'const _Ty', possible loss of data
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
index 1d31a43953d3b..989edcb3f6eed 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
@@ -8,7 +8,7 @@
// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare
// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
-// ADDITIONAL_COMPILE_FLAGS(clang-21): -Wno-implicit-unicode-conversion
+// ADDITIONAL_COMPILE_FLAGS(character-conversion-warnings): -Wno-character-conversion
// MSVC warning C4245: conversion from 'int' to 'wchar_t', signed/unsigned mismatch
// MSVC warning C4305: truncation from 'int' to 'bool'
// MSVC warning C4310: cast truncates constant value
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 10fc4b0afde6b..74746e37d3bc4 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -144,6 +144,10 @@ def _mingwSupportsModules(cfg):
when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"),
actions=[AddCompileFlag("-Wuser-defined-warnings")],
),
+ Feature(
+ name="character-conversion-warnings",
+ when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"),
+ ),
# Tests to validate whether the compiler has a way to set the maximum number
# of steps during constant evaluation. Since the flag differs per compiler
# store the "valid" flag as a feature. This allows passing the proper compile
>From 8e39dc9c6e82b8418e461b9b99553064dcf09074 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Sat, 10 May 2025 22:53:44 +0200
Subject: [PATCH 6/8] Don't warn if one side of the comparison can be evaluated
to a code point representable in both types
---
.../clang/Basic/DiagnosticSemaKinds.td | 6 +-
clang/lib/AST/ASTDiagnostic.cpp | 3 +-
clang/lib/Sema/SemaChecking.cpp | 1 +
clang/lib/Sema/SemaExpr.cpp | 74 +++++++++++--------
.../warn-implicit-unicode-conversions.cpp | 44 +++++------
5 files changed, 68 insertions(+), 60 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index be2791c3ff1fc..686dce9077735 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4367,12 +4367,12 @@ def warn_impcast_unicode_precision
InGroup<CharacterConversion>;
def warn_impcast_unicode_char_type_constant
: Warning<"implicit conversion from %0 to %1 changes the meaning of the "
- "%select{code unit|codepoint}2 '%3'">,
+ "%select{code unit|code point}2 '%3'">,
InGroup<CharacterConversion>;
def warn_comparison_unicode_mixed_types
: Warning<"comparing values of different Unicode code unit types %0 and %1 "
- "may compare different codepoints">,
+ "may compare different code points">,
InGroup<CharacterConversion>;
def warn_comparison_unicode_mixed_types_constant
@@ -7742,7 +7742,7 @@ def warn_comparison_of_mixed_enum_types_switch : Warning<
"%diff{ ($ and $)|}0,1">,
InGroup<EnumCompareSwitch>;
-def warn_arith_conv_mixed__unicode_types
+def warn_arith_conv_mixed_unicode_types
: Warning<"%sub{select_arith_conv_kind}0 "
"different Unicode character types %1 and %2">,
InGroup<CharacterConversion>;
diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp
index 0c9f50fb1a01c..a00d5801f054b 100644
--- a/clang/lib/AST/ASTDiagnostic.cpp
+++ b/clang/lib/AST/ASTDiagnostic.cpp
@@ -2203,9 +2203,10 @@ std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) {
assert(Value <= 0xFFFF && "not a valid UTF-16 code unit");
return llvm::IsSingleCodeUnitUTF16Codepoint(Value);
}
+ assert(T->isChar32Type());
return llvm::IsSingleCodeUnitUTF32Codepoint(Value);
};
- llvm::SmallVector<char, 4> Str;
+ llvm::SmallVector<char, 16> Str;
if (!IsSingleCodeUnitCP(Value, T)) {
llvm::raw_svector_ostream OS(Str);
OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">";
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 9361683ff4a8c..13fa2b8ef5143 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -11834,6 +11834,7 @@ static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
if (T->isChar16Type())
return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
+ assert(T->isChar32Type());
return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
};
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index a7a7f55f3d34f..e42a85a04f5bf 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -1582,50 +1582,60 @@ static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS,
if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
return;
- Expr::EvalResult LHSRes, RHSRes;
- bool Success = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(),
- Expr::SE_AllowSideEffects,
- SemaRef.isConstantEvaluatedContext());
- if (Success)
- Success = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(),
- Expr::SE_AllowSideEffects,
- SemaRef.isConstantEvaluatedContext());
- if (Success) {
- llvm::APSInt LHSValue(32);
- LHSValue = LHSRes.Val.getInt();
- llvm::APSInt RHSValue(32);
- RHSValue = RHSRes.Val.getInt();
-
- auto IsSingleCodeUnitCP = [](const QualType &T,
- const llvm::APSInt &Value) {
- if (T->isChar8Type())
- return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
- if (T->isChar16Type())
- return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
- return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
- };
+ auto IsSingleCodeUnitCP = [](const QualType &T, const llvm::APSInt &Value) {
+ if (T->isChar8Type())
+ return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
+ if (T->isChar16Type())
+ return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
+ assert(T->isChar32Type());
+ return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
+ };
- bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue);
- bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue);
- if (LHSSafe && RHSSafe)
+ Expr::EvalResult LHSRes, RHSRes;
+ bool LHSSuccess = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(),
+ Expr::SE_AllowSideEffects,
+ SemaRef.isConstantEvaluatedContext());
+ bool RHSuccess = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(),
+ Expr::SE_AllowSideEffects,
+ SemaRef.isConstantEvaluatedContext());
+
+ // Don't warn if the one known value is representable
+ // in the types of both expressions.
+ if (LHSSuccess != RHSuccess) {
+ Expr::EvalResult &Res = LHSSuccess ? LHSRes : RHSRes;
+ if (IsSingleCodeUnitCP(LHSType, Res.Val.getInt()) &&
+ IsSingleCodeUnitCP(RHSType, Res.Val.getInt()))
return;
+ }
- SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
+ if (!LHSSuccess || !RHSuccess) {
+ SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
<< LHS->getSourceRange() << RHS->getSourceRange() << LHSType
- << RHSType
- << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType)
- << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType);
+ << RHSType;
return;
}
- SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
- << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType;
+
+ llvm::APSInt LHSValue(32);
+ LHSValue = LHSRes.Val.getInt();
+ llvm::APSInt RHSValue(32);
+ RHSValue = RHSRes.Val.getInt();
+
+ bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue);
+ bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue);
+ if (LHSSafe && RHSSafe)
+ return;
+
+ SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
+ << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType
+ << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType)
+ << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType);
return;
}
if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
return;
- SemaRef.Diag(Loc, diag::warn_arith_conv_mixed__unicode_types)
+ SemaRef.Diag(Loc, diag::warn_arith_conv_mixed_unicode_types)
<< LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType
<< RHSType;
return;
diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
index 41794b15175b5..fcff006d0e028 100644
--- a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
+++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
@@ -19,12 +19,12 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
c8(char32_t(0x7f));
- c8(char32_t(0x80)); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' changes the meaning of the codepoint '<U+0080>'}}
+ c8(char32_t(0x80)); // expected-warning {{implicit conversion from 'char32_t' to 'char8_t' changes the meaning of the code point '<U+0080>'}}
c8(char16_t(0x7f));
- c8(char16_t(0x80)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the codepoint '<U+0080>'}}
+ c8(char16_t(0x80)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code point '<U+0080>'}}
c8(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code unit '<0xD800>'}}
- c8(char16_t(0xE000)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the codepoint '<U+E000>'}}
+ c8(char16_t(0xE000)); // expected-warning {{implicit conversion from 'char16_t' to 'char8_t' changes the meaning of the code point '<U+E000>'}}
c16(char32_t(0x7f));
@@ -32,7 +32,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
c16(char32_t(0xD7FF));
c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}}
c16(char32_t(0xE000));
- c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the codepoint '🐉'}}
+ c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code point '🐉'}}
c32(char8_t(0x7f));
@@ -49,7 +49,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
c32(char16_t(0xE000));
c32(char16_t(u'☕'));
- (void)static_cast<char32_t>(char8_t(0x80)); // sanity check: no explicit conversion;
+ (void)static_cast<char32_t>(char8_t(0x80)); //no warnings for explicit conversions.
using Char8 = char8_t;
Char8 c81 = u16; // expected-warning {{implicit conversion from 'char16_t' to 'Char8' (aka 'char8_t') may lose precision and change the meaning of the represented code unit}}
@@ -63,31 +63,27 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
void test_comp(char8_t u8, char16_t u16, char32_t u32) {
(void)(u8 == u8' ');
- (void)(u8 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' may compare different codepoints}}
- (void)(u8 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different codepoints}}
+ (void)(u8 == u' ');
+ (void)(u8 == U' ');
- (void)(u16 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different codepoints}}
- (void)(u16 == u' ');
- (void)(u16 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different codepoints}}
+ (void)(u16 == u8' ');
+ (void)(u16 == U' ');
- (void)(u32 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different codepoints}}
- (void)(u32 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different codepoints}}
+ (void)(u32 == u8' ');
+ (void)(u32 == u' ');
(void)(u32 == U' ');
+ (void)(u8 == u'\u00FF'); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char16_t' may compare different code points}}
+ (void)(u8 == U'\u00FF'); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different code points}}
- (void)(u8' ' == u' ');
- (void)(u8' ' == u' ');
-
-
- (void)(u8 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char8_t' and 'char32_t' may compare different codepoints}}
- (void)(u16 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different codepoints}}
- (void)(u16 == u' ');
- (void)(u16 == U' '); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different codepoints}}
-
- (void)(u32 == u8' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different codepoints}}
- (void)(u32 == u' '); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different codepoints}}
- (void)(u32 == U' ');
+ (void)(u16 == u8'\xFF'); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char8_t' may compare different code points}}
+ (void)(u16 == u'\u00FF');
+ (void)(u16 == U'\u00FF');
+ (void)(u16 == U'\xD800'); // expected-warning{{comparing values of different Unicode code unit types 'char16_t' and 'char32_t' may compare different code points}}
+ (void)(u32 == u8'\xFF'); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char8_t' may compare different code points}}
+ (void)(u32 == u'\u00FF');
+ (void)(u32 == u'\xD800'); // expected-warning{{comparing values of different Unicode code unit types 'char32_t' and 'char16_t' may compare different code points}}
(void)(char8_t(0x7f) == char8_t(0x7f));
(void)(char8_t(0x7f) == char16_t(0x7f));
>From c299893015da0e0727139d128c3ef4ac4e686929 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Sat, 10 May 2025 23:15:45 +0200
Subject: [PATCH 7/8] libc++ 03 fixes
---
libcxx/test/std/localization/codecvt_unicode.pass.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
index 7c5f112c7f495..fed183ee0e71f 100644
--- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp
+++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
@@ -484,7 +484,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
- const InternT expected[] = {u'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {0x61, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
@@ -549,7 +549,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
- const InternT expected[] = {u'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {0x61, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
@@ -618,7 +618,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
- const InternT expected[] = {u'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {0x61, 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
>From 9a17333fb5965f226be4f6d3783513da74571671 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot at gmail.com>
Date: Sun, 11 May 2025 10:21:59 +0200
Subject: [PATCH 8/8] I guess I don't know my ASCII tables...
---
libcxx/test/std/localization/codecvt_unicode.pass.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
index fed183ee0e71f..da1acc8061fe1 100644
--- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp
+++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
@@ -484,7 +484,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
- const InternT expected[] = {0x61, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {0x62, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
@@ -549,7 +549,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
- const InternT expected[] = {0x61, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {0x62, 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
@@ -618,7 +618,7 @@ template <class InternT, class ExternT>
void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
// UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
- const InternT expected[] = {0x61, 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
+ const InternT expected[] = {0x62, 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
static_assert(array_size(input) == 11, "");
static_assert(array_size(expected) == 6, "");
More information about the llvm-commits
mailing list