[clang] [CIR] Support wide string literals in CIR codegen (PR #171541)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Dec 11 09:39:14 PST 2025
https://github.com/adams381 updated https://github.com/llvm/llvm-project/pull/171541
>From 13d288e89c4505e46a6ca3b1767c70e65fc8307c Mon Sep 17 00:00:00 2001
From: Adam Smith <adams at nvidia.com>
Date: Tue, 9 Dec 2025 16:07:08 -0800
Subject: [PATCH 1/2] [CIR] Support wide string literals in CIR codegen
Implement support for wide string literals (wchar_t, char16_t, char32_t)
in getConstantArrayFromStringLiteral. This migrates the feature from
the incubator to upstream.
The implementation handles wide character string literals by:
- Collecting code units using getCodeUnit()
- Creating constant arrays with IntAttr elements
- Using ZeroAttr for null-filled strings
Add test file wide-string.cpp copied from incubator, expanded with
wchar_t test cases.
---
clang/lib/CIR/CodeGen/CIRGenModule.cpp | 37 ++++++++++++++++++++++--
clang/test/CIR/CodeGen/wide-string.cpp | 40 ++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 3 deletions(-)
create mode 100644 clang/test/CIR/CodeGen/wide-string.cpp
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 41a5d9db83e2b..c0dcd3f55f328 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -21,7 +21,9 @@
#include "clang/AST/GlobalDecl.h"
#include "clang/AST/RecordLayout.h"
#include "clang/Basic/SourceManager.h"
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "clang/CIR/Interfaces/CIROpInterfaces.h"
#include "clang/CIR/MissingFeatures.h"
@@ -31,6 +33,8 @@
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/Verifier.h"
+#include <algorithm>
+
using namespace clang;
using namespace clang::CIRGen;
@@ -960,9 +964,36 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) {
return builder.getString(str, eltTy, finalSize);
}
- errorNYI(e->getSourceRange(),
- "getConstantArrayFromStringLiteral: wide characters");
- return mlir::Attribute();
+ auto arrayTy = mlir::dyn_cast<cir::ArrayType>(convertType(e->getType()));
+ assert(arrayTy && "string literals must be emitted as an array type");
+
+ auto arrayEltTy = mlir::dyn_cast<cir::IntType>(arrayTy.getElementType());
+ assert(arrayEltTy &&
+ "string literal elements must be emitted as integral type");
+
+ auto arraySize = arrayTy.getSize();
+ auto literalSize = e->getLength();
+
+ // Collect the code units.
+ SmallVector<uint32_t, 32> elementValues;
+ elementValues.reserve(arraySize);
+ for (unsigned i = 0; i < literalSize; ++i)
+ elementValues.push_back(e->getCodeUnit(i));
+ elementValues.resize(arraySize);
+
+ // If the string is full of null bytes, emit a #cir.zero instead.
+ if (std::all_of(elementValues.begin(), elementValues.end(),
+ [](uint32_t x) { return x == 0; }))
+ return cir::ZeroAttr::get(arrayTy);
+
+ // Otherwise emit a constant array holding the characters.
+ SmallVector<mlir::Attribute, 32> elements;
+ elements.reserve(arraySize);
+ for (uint64_t i = 0; i < arraySize; ++i)
+ elements.push_back(cir::IntAttr::get(arrayEltTy, elementValues[i]));
+
+ auto elementsAttr = mlir::ArrayAttr::get(&getMLIRContext(), elements);
+ return builder.getConstArray(elementsAttr, arrayTy);
}
bool CIRGenModule::supportsCOMDAT() const {
diff --git a/clang/test/CIR/CodeGen/wide-string.cpp b/clang/test/CIR/CodeGen/wide-string.cpp
new file mode 100644
index 0000000000000..9f145d022a943
--- /dev/null
+++ b/clang/test/CIR/CodeGen/wide-string.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+const char16_t *test_utf16() {
+ return u"你好世界";
+}
+
+// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u16i, #cir.int<22909> : !u16i, #cir.int<19990> : !u16i, #cir.int<30028> : !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5>
+
+const char32_t *test_utf32() {
+ return U"你好世界";
+}
+
+// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u32i, #cir.int<22909> : !u32i, #cir.int<19990> : !u32i, #cir.int<30028> : !u32i, #cir.int<0> : !u32i]> : !cir.array<!u32i x 5>
+
+const char16_t *test_zero16() {
+ return u"\0\0\0\0";
+}
+
+// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u16i x 5>
+
+const char32_t *test_zero32() {
+ return U"\0\0\0\0";
+}
+
+// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u32i x 5>
+
+#include <stddef.h>
+
+const wchar_t *test_wchar() {
+ return L"1234";
+}
+
+// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5>
+
+const wchar_t *test_wchar_zero() {
+ return L"";
+}
+
+// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!s32i x 1>
>From aa633ba1d1a9ffd253e036dfd8f5bbad7b8a27b1 Mon Sep 17 00:00:00 2001
From: Adam Smith <adams at nvidia.com>
Date: Thu, 11 Dec 2025 09:38:24 -0800
Subject: [PATCH 2/2] Address reviewer feedback: improve code style and
performance
- Use mlir::cast<> instead of dyn_cast<> + assert
- Use explicit types (uint64_t, unsigned) instead of auto for method returns
- Remove unnecessary SmallVector size parameters
- Optimize zero-check to happen before building vector (early exit)
- Avoid double-looping by building elements directly
- Add LLVM and OGCG checks to test file
---
clang/lib/CIR/CodeGen/CIRGenModule.cpp | 40 ++++++++++++++------------
clang/test/CIR/CodeGen/wide-string.cpp | 30 ++++++++++++++-----
2 files changed, 44 insertions(+), 26 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index c0dcd3f55f328..5c19b8b58d0ed 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -964,33 +964,35 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) {
return builder.getString(str, eltTy, finalSize);
}
- auto arrayTy = mlir::dyn_cast<cir::ArrayType>(convertType(e->getType()));
- assert(arrayTy && "string literals must be emitted as an array type");
+ auto arrayTy = mlir::cast<cir::ArrayType>(convertType(e->getType()));
- auto arrayEltTy = mlir::dyn_cast<cir::IntType>(arrayTy.getElementType());
- assert(arrayEltTy &&
- "string literal elements must be emitted as integral type");
+ auto arrayEltTy = mlir::cast<cir::IntType>(arrayTy.getElementType());
- auto arraySize = arrayTy.getSize();
- auto literalSize = e->getLength();
+ uint64_t arraySize = arrayTy.getSize();
+ unsigned literalSize = e->getLength();
- // Collect the code units.
- SmallVector<uint32_t, 32> elementValues;
- elementValues.reserve(arraySize);
- for (unsigned i = 0; i < literalSize; ++i)
- elementValues.push_back(e->getCodeUnit(i));
- elementValues.resize(arraySize);
+ // Check whether every code unit is zero before building the vector.
+ // In most non-zero cases, this will break out on the first element.
+ // Padding elements (if literalSize < arraySize) are implicitly zero.
+ bool isAllZero = true;
+ for (unsigned i = 0; i < literalSize; ++i) {
+ if (e->getCodeUnit(i) != 0) {
+ isAllZero = false;
+ break;
+ }
+ }
- // If the string is full of null bytes, emit a #cir.zero instead.
- if (std::all_of(elementValues.begin(), elementValues.end(),
- [](uint32_t x) { return x == 0; }))
+ if (isAllZero)
return cir::ZeroAttr::get(arrayTy);
// Otherwise emit a constant array holding the characters.
- SmallVector<mlir::Attribute, 32> elements;
+ SmallVector<mlir::Attribute> elements;
elements.reserve(arraySize);
- for (uint64_t i = 0; i < arraySize; ++i)
- elements.push_back(cir::IntAttr::get(arrayEltTy, elementValues[i]));
+ for (unsigned i = 0; i < literalSize; ++i)
+ elements.push_back(cir::IntAttr::get(arrayEltTy, e->getCodeUnit(i)));
+ // Pad with zeros if needed.
+ for (uint64_t i = literalSize; i < arraySize; ++i)
+ elements.push_back(cir::IntAttr::get(arrayEltTy, 0));
auto elementsAttr = mlir::ArrayAttr::get(&getMLIRContext(), elements);
return builder.getConstArray(elementsAttr, arrayTy);
diff --git a/clang/test/CIR/CodeGen/wide-string.cpp b/clang/test/CIR/CodeGen/wide-string.cpp
index 9f145d022a943..3ce790f0cff3d 100644
--- a/clang/test/CIR/CodeGen/wide-string.cpp
+++ b/clang/test/CIR/CodeGen/wide-string.cpp
@@ -1,29 +1,41 @@
// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
-// RUN: FileCheck --input-file=%t.cir %s
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
const char16_t *test_utf16() {
return u"你好世界";
}
-// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u16i, #cir.int<22909> : !u16i, #cir.int<19990> : !u16i, #cir.int<30028> : !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5>
+// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u16i, #cir.int<22909> : !u16i, #cir.int<19990> : !u16i, #cir.int<30028> : !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5>
+// LLVM: @{{.+}} = private constant [5 x i16] [i16 20320, i16 22909, i16 19990, i16 30028, i16 0]
+// OGCG: @{{.+}} = private unnamed_addr constant [5 x i16] [i16 20320, i16 22909, i16 19990, i16 30028, i16 0]
const char32_t *test_utf32() {
return U"你好世界";
}
-// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u32i, #cir.int<22909> : !u32i, #cir.int<19990> : !u32i, #cir.int<30028> : !u32i, #cir.int<0> : !u32i]> : !cir.array<!u32i x 5>
+// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u32i, #cir.int<22909> : !u32i, #cir.int<19990> : !u32i, #cir.int<30028> : !u32i, #cir.int<0> : !u32i]> : !cir.array<!u32i x 5>
+// LLVM: @{{.+}} = private constant [5 x i32] [i32 20320, i32 22909, i32 19990, i32 30028, i32 0]
+// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] [i32 20320, i32 22909, i32 19990, i32 30028, i32 0]
const char16_t *test_zero16() {
return u"\0\0\0\0";
}
-// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u16i x 5>
+// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u16i x 5>
+// LLVM: @{{.+}} = private constant [5 x i16] zeroinitializer
+// OGCG: @{{.+}} = private unnamed_addr constant [5 x i16] zeroinitializer
const char32_t *test_zero32() {
return U"\0\0\0\0";
}
-// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u32i x 5>
+// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u32i x 5>
+// LLVM: @{{.+}} = private constant [5 x i32] zeroinitializer
+// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] zeroinitializer
#include <stddef.h>
@@ -31,10 +43,14 @@ const wchar_t *test_wchar() {
return L"1234";
}
-// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5>
+// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5>
+// LLVM: @{{.+}} = private constant [5 x i32] [i32 49, i32 50, i32 51, i32 52, i32 0]
+// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] [i32 49, i32 50, i32 51, i32 52, i32 0]
const wchar_t *test_wchar_zero() {
return L"";
}
-// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!s32i x 1>
+// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!s32i x 1>
+// LLVM: @{{.+}} = private constant [1 x i32] zeroinitializer
+// OGCG: @{{.+}} = private unnamed_addr constant [1 x i32] zeroinitializer
More information about the cfe-commits
mailing list