[clang] [libcxx] [libc++] Don't dispatch find to wmemchr under -fshort-wchar (PR #203621)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 12 13:05:02 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libcxx
Author: Akira Hatanaka (ahatanak)
<details>
<summary>Changes</summary>
std::__find routes a search to the libc wmemchr (via
__constexpr_wmemchr) whenever the element type has the same size as
wchar_t and at least its alignment. That is wrong under -fshort-wchar on
a platform whose native wchar_t is 4 bytes: wchar_t shrinks to 2 bytes, so
_Tp=char16_t satisfies that condition, and the search is routed to
wmemchr, which still reads 4-byte elements and therefore returns wrong
results.
Only take the wmemchr path when wchar_t is still its native type, i.e.,
unmodified by -fshort-wchar. The check uses the new __native_wchar_t
alias in <cwchar> (from __WCHAR_NATIVE_TYPE__, falling back to wchar_t
on older compilers). Normal builds keep the wmemchr fast path unchanged.
Fixes https://github.com/llvm/llvm-project/issues/195149
rdar://175090927
---
Full diff: https://github.com/llvm/llvm-project/pull/203621.diff
10 Files Affected:
- (modified) clang/docs/LanguageExtensions.rst (+8)
- (modified) clang/docs/ReleaseNotes.rst (+7)
- (modified) clang/include/clang/Basic/TargetInfo.h (+12)
- (modified) clang/lib/Basic/TargetInfo.cpp (+8)
- (modified) clang/lib/Frontend/InitPreprocessor.cpp (+1)
- (modified) clang/test/Preprocessor/init-aarch64.c (+1)
- (modified) clang/test/Preprocessor/init.c (+1)
- (modified) libcxx/include/__algorithm/find.h (+7-1)
- (modified) libcxx/include/cwchar (+9)
- (added) libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp (+42)
``````````diff
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index fbb9947f39d3e..c0beb6ddecd02 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -393,6 +393,14 @@ Builtin Macros
Defined to an integral value that is the include depth of the file currently
being translated. For the main file, this value is zero.
+``__WCHAR_NATIVE_TYPE__``
+ clang-specific extension defined to the platform's native type for
+ ``wchar_t``, i.e., the type ``wchar_t`` would have without ``-fshort-wchar``.
+ This matches ``__WCHAR_TYPE__`` unless ``-fshort-wchar`` is in effect. This
+ lets code detect when ``wchar_t`` is different from its native type,
+ e.g., to decide whether dispatching to a ``wchar_t``-based runtime function
+ such as ``wmemchr`` is safe.
+
``__TIMESTAMP__``
Defined to the date and time of the last modification of the current source
file.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index cf4826f50e5a5..12d09cb361825 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -312,6 +312,13 @@ Non-comprehensive list of changes in this release
- ``typeid`` on references and pointers of ``final`` types no longer emits a
vtable lookup at runtime.
+- Added a new predefined macro ``__WCHAR_NATIVE_TYPE__``, expanding to the
+ platform's native type for ``wchar_t`` (the type ``wchar_t`` would have
+ without ``-fshort-wchar``). It matches ``__WCHAR_TYPE__`` unless
+ ``-fshort-wchar`` is in effect, letting code detect when ``wchar_t`` is
+ different from its native type.
+
+
- Updated support for Unicode from 15.1 to 18.0.
New Compiler Flags
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index cc226403877e2..8615da60803f7 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -163,6 +163,11 @@ struct TransferrableTargetInfo {
Char16Type, Char32Type, Int64Type, Int16Type, SigAtomicType,
ProcessIDType;
+ /// The platform's native type for wchar_t, i.e., the type wchar_t would have
+ /// without -fshort-wchar. This matches WCharType unless -fshort-wchar is in
+ /// effect.
+ IntType WideCharNativeType;
+
/// Whether Objective-C's built-in boolean type should be signed char.
///
/// Otherwise, when this flag is not set, the normal built-in boolean type is
@@ -417,6 +422,13 @@ class TargetInfo : public TransferrableTargetInfo,
return getCorrespondingUnsignedType(IntPtrType);
}
IntType getWCharType() const { return WCharType; }
+
+ /// Return the platform's native type for wchar_t, i.e., the type wchar_t
+ /// would have without -fshort-wchar.
+ IntType getWideCharNativeType() const {
+ return WideCharNativeType == NoInt ? WCharType : WideCharNativeType;
+ }
+
IntType getWIntType() const { return WIntType; }
IntType getChar16Type() const { return Char16Type; }
IntType getChar32Type() const { return Char32Type; }
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 854d23cadaea2..62fb6c8175484 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -137,6 +137,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
IntMaxType = SignedLongLong;
IntPtrType = SignedLong;
WCharType = SignedInt;
+ WideCharNativeType = NoInt;
WIntType = SignedInt;
Char16Type = UnsignedShort;
Char32Type = UnsignedInt;
@@ -423,6 +424,13 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts,
if (Opts.NoBitFieldTypeAlign)
UseBitFieldTypeAlignment = false;
+ // Capture the platform-native wchar_t before -fshort-wchar can override
+ // WCharType below. adjust() may run more than once on the same target, so
+ // only record it the first time, while WCharType still holds the target
+ // default.
+ if (WideCharNativeType == NoInt)
+ WideCharNativeType = WCharType;
+
switch (Opts.WCharSize) {
default: llvm_unreachable("invalid wchar_t width");
case 0: break;
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 3f0468a938149..f516c5159dba7 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1168,6 +1168,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
DefineType("__SIZE_TYPE__", TI.getSizeType(), Builder);
DefineFmt(LangOpts, "__SIZE", TI.getSizeType(), TI, Builder);
DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
+ DefineType("__WCHAR_NATIVE_TYPE__", TI.getWideCharNativeType(), Builder);
DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder);
if (LangOpts.C23)
diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c
index 09e3fc926a309..3ec78a7651480 100644
--- a/clang/test/Preprocessor/init-aarch64.c
+++ b/clang/test/Preprocessor/init-aarch64.c
@@ -393,6 +393,7 @@
// AARCH64-NEXT: #define __USER_LABEL_PREFIX__
// AARCH64-NEXT: #define __VERSION__ "{{.*}}"
// AARCH64-NEXT: #define __WCHAR_MAX__ 4294967295U
+// AARCH64-NEXT: #define __WCHAR_NATIVE_TYPE__ unsigned int
// AARCH64-NEXT: #define __WCHAR_TYPE__ unsigned int
// AARCH64-NEXT: #define __WCHAR_UNSIGNED__ 1
// AARCH64-NEXT: #define __WCHAR_WIDTH__ 32
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 80b7a6399e5f4..cc67db4fa068e 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -2076,6 +2076,7 @@
// WEBASSEMBLY-NEXT:#define __USER_LABEL_PREFIX__
// WEBASSEMBLY-NEXT:#define __VERSION__ "{{.*}}"
// WEBASSEMBLY-NEXT:#define __WCHAR_MAX__ 2147483647
+// WEBASSEMBLY-NEXT:#define __WCHAR_NATIVE_TYPE__ int
// WEBASSEMBLY-NEXT:#define __WCHAR_TYPE__ int
// WEBASSEMBLY-NOT:#define __WCHAR_UNSIGNED__
// WEBASSEMBLY-NEXT:#define __WCHAR_WIDTH__ 32
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index f677fb2c7392d..66657a9056537 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -127,7 +127,13 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _T
return __last;
}
# if _LIBCPP_HAS_WIDE_CHARACTERS
- else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
+ // __builtin_wmemchr lowers to a libc call that walks native-sized wchar_t
+ // elements. Only take this path when wchar_t still has its platform-native
+ // size and alignment. Otherwise (e.g., under -fshort-wchar) fall through to the
+ // vectorized integral path, which honors the current wchar_t size.
+ else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t) &&
+ sizeof(wchar_t) == sizeof(__native_wchar_t) && _LIBCPP_ALIGNOF(wchar_t) ==
+ _LIBCPP_ALIGNOF(__native_wchar_t)) {
if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
return __ret;
return __last;
diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar
index e2534977a7a3c..cc7e7fd8128bd 100644
--- a/libcxx/include/cwchar
+++ b/libcxx/include/cwchar
@@ -197,6 +197,15 @@ using ::putwchar _LIBCPP_USING_IF_EXISTS;
using ::vwprintf _LIBCPP_USING_IF_EXISTS;
using ::wprintf _LIBCPP_USING_IF_EXISTS;
+// Names the platform-native wchar_t (the type wchar_t would have without
+// -fshort-wchar). Falls back to wchar_t on compilers that predate
+// __WCHAR_NATIVE_TYPE__ (Clang < 23), preserving prior behavior.
+# ifdef __WCHAR_NATIVE_TYPE__
+using __native_wchar_t = __WCHAR_NATIVE_TYPE__;
+# else
+using __native_wchar_t = wchar_t;
+# endif
+
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_wcslen(const wchar_t* __str) {
# if __has_builtin(__builtin_wcslen)
return __builtin_wcslen(__str);
diff --git a/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp
new file mode 100644
index 0000000000000..a261f7fd30c9c
--- /dev/null
+++ b/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Regression test for llvm/llvm-project#195149: u16string::find used to
+// dispatch through __builtin_wmemchr when sizeof(char16_t) == sizeof(wchar_t).
+// Under -fshort-wchar on a platform whose native wchar_t is 4 bytes
+// (e.g., Linux/Darwin), the libc wmemchr keeps walking 4-byte elements, so the
+// search returned wrong results. __find now gates the wmemchr fast path on the
+// platform-native wchar_t size (via __WCHAR_NATIVE_TYPE__) so the runtime
+// libcall is taken only when it is binary-compatible with what wmemchr expects.
+//
+// Only meaningful where the platform-native wchar_t differs from 2 bytes; on
+// Windows (native 2-byte wchar_t) the optimization is always safe.
+
+// ADDITIONAL_COMPILE_FLAGS: -fshort-wchar
+
+#include <cassert>
+#include <string>
+
+#include "test_macros.h"
+
+TEST_CONSTEXPR_CXX20 bool test() {
+ std::u16string s = u"hello";
+ std::u16string t = u"goodbye";
+ assert(s.find(u'o') == 4);
+ assert(t.find(u'b') == 4);
+ assert(s.find(u'z') == std::u16string::npos);
+ return true;
+}
+
+int main(int, char**) {
+ test();
+#if TEST_STD_VER >= 20
+ static_assert(test());
+#endif
+ return 0;
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/203621
More information about the cfe-commits mailing list