[clang] [CIR] Upstream support for string literals (PR #140796)
Andy Kaylor via cfe-commits
cfe-commits at lists.llvm.org
Tue May 20 16:41:15 PDT 2025
https://github.com/andykaylor updated https://github.com/llvm/llvm-project/pull/140796
>From 7e1e98fd2f1686709fe16b6853dcbf00623e626c Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor at nvidia.com>
Date: Tue, 20 May 2025 09:51:41 -0700
Subject: [PATCH 1/3] [CIR] Upstream support for string literals
This adds the minimal support needed to handle string literals.
---
.../CIR/Dialect/Builder/CIRBaseBuilder.h | 7 ++
clang/include/clang/CIR/MissingFeatures.h | 1 +
clang/lib/CIR/CodeGen/CIRGenBuilder.h | 28 +++++
clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 10 ++
clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 2 +
clang/lib/CIR/CodeGen/CIRGenFunction.h | 2 +
clang/lib/CIR/CodeGen/CIRGenModule.cpp | 102 ++++++++++++++++++
clang/lib/CIR/CodeGen/CIRGenModule.h | 11 ++
.../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 12 +++
clang/test/CIR/CodeGen/string-literals.c | 56 ++++++++++
10 files changed, 231 insertions(+)
create mode 100644 clang/test/CIR/CodeGen/string-literals.c
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index b680e4162a5ce..738f33bf36c9e 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -13,6 +13,7 @@
#include "clang/CIR/Dialect/IR/CIRAttrs.h"
#include "clang/CIR/Dialect/IR/CIRDialect.h"
#include "clang/CIR/Dialect/IR/CIRTypes.h"
+#include "clang/CIR/MissingFeatures.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/Support/ErrorHandling.h"
@@ -177,6 +178,12 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
return create<cir::AllocaOp>(loc, addrType, type, name, alignment);
}
+ mlir::Value createGetGlobal(mlir::Location loc, cir::GlobalOp global) { // Emit a cir::GetGlobalOp naming `global`.
+ assert(!cir::MissingFeatures::addressSpace()); // presumably: address spaces are not handled yet
+ return create<cir::GetGlobalOp>(loc, getPointerTo(global.getSymType()), // result type: pointer to the global's type
+ global.getSymName());
+ }
+
cir::LoadOp createLoad(mlir::Location loc, mlir::Value ptr,
bool isVolatile = false, uint64_t alignment = 0) {
mlir::IntegerAttr intAttr;
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 7b33d94483d5f..d43e2d9f461d1 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -38,6 +38,7 @@ struct MissingFeatures {
static bool opGlobalWeakRef() { return false; }
static bool opGlobalLinkage() { return false; }
static bool opGlobalSetVisitibility() { return false; }
+ static bool opGlobalUnnamedAddr() { return false; }
static bool supportIFuncAttr() { return false; }
static bool supportVisibility() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index b1b0826a4e44a..aff8b8949f3ad 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -26,6 +26,34 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
CIRGenBuilderTy(mlir::MLIRContext &mlirContext, const CIRGenTypeCache &tc)
: CIRBaseBuilderTy(mlirContext), typeCache(tc) {}
+ /// Get a cir::ConstArrayAttr (cir::ZeroAttr if all null) for a string literal.
+ /// Note: This is different from what is returned by
+ /// mlir::Builder::getStringAttr() which is an mlir::StringAttr.
+ mlir::Attribute getString(llvm::StringRef str, mlir::Type eltTy,
+ unsigned size) {
+ unsigned finalSize = size ? size : str.size();
+
+ size_t lastNonZeroPos = str.find_last_not_of('\0');
+ // If the string is full of null bytes, emit a #cir.zero rather than
+ // a #cir.const_array.
+ if (lastNonZeroPos == llvm::StringRef::npos) {
+ auto arrayTy = cir::ArrayType::get(eltTy, finalSize);
+ return cir::ZeroAttr::get(arrayTy);
+ }
+ // We emit trailing zeros only if there are multiple trailing zeros.
+ int trailingZerosNum = 0;
+ if (finalSize > lastNonZeroPos + 2)
+ trailingZerosNum = finalSize - lastNonZeroPos - 1;
+ auto truncatedArrayTy =
+ cir::ArrayType::get(eltTy, finalSize - trailingZerosNum); // type of the stored string data
+ auto fullArrayTy = cir::ArrayType::get(eltTy, finalSize); // type of the whole constant
+ return cir::ConstArrayAttr::get(
+ fullArrayTy,
+ mlir::StringAttr::get(str.drop_back(trailingZerosNum),
+ truncatedArrayTy), // NOTE(review): drop_back assumes str.size() == finalSize
+ trailingZerosNum); // number of nulls stripped from the string data
+ }
+
std::string getUniqueAnonRecordName() { return getUniqueRecordName("anon"); }
std::string getUniqueRecordName(const std::string &baseName) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index c5fe3c1378624..a8fecafe4a1f3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -743,6 +743,16 @@ CIRGenFunction::emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e) {
return lv;
}
+LValue CIRGenFunction::emitStringLiteralLValue(const StringLiteral *e) {
+ cir::GlobalOp globalOp = cgm.getGlobalForStringLiteral(e); // global that holds the literal's data
+ assert(!cir::MissingFeatures::opGlobalAlignment());
+ mlir::Value addr =
+ builder.createGetGlobal(getLoc(e->getSourceRange()), globalOp);
+ return makeAddrLValue(
+ Address(addr, globalOp.getSymType(), CharUnits::fromQuantity(1)), // NOTE(review): alignment fixed at 1 - confirm
+ e->getType(), AlignmentSource::Decl);
+}
+
/// Casts are never lvalues unless that cast is to a reference type. If the cast
/// is to a reference, we can have the usual lvalue result, otherwise if a cast
/// is needed by the code generator in an lvalue context, then it must mean that
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index c3798de79d969..ce88e656a38e8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -531,6 +531,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) {
return emitArraySubscriptExpr(cast<ArraySubscriptExpr>(e));
case Expr::UnaryOperatorClass:
return emitUnaryOpLValue(cast<UnaryOperator>(e));
+ case Expr::StringLiteralClass:
+ return emitStringLiteralLValue(cast<StringLiteral>(e));
case Expr::MemberExprClass:
return emitMemberExpr(cast<MemberExpr>(e));
case Expr::BinaryOperatorClass:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index ce080f481da6b..74f2e4043933d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -695,6 +695,8 @@ class CIRGenFunction : public CIRGenTypeCache {
mlir::Value emitStoreThroughBitfieldLValue(RValue src, LValue dstresult);
+ LValue emitStringLiteralLValue(const StringLiteral *e);
+
mlir::LogicalResult emitSwitchBody(const clang::Stmt *s);
mlir::LogicalResult emitSwitchCase(const clang::SwitchCase &s,
bool buildingTopLevelCase);
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index e170498b67548..5bae8908d5dbb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -562,6 +562,30 @@ void CIRGenModule::emitGlobalDefinition(clang::GlobalDecl gd,
llvm_unreachable("Invalid argument to CIRGenModule::emitGlobalDefinition");
}
+mlir::Attribute
+CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) {
+ assert(!e->getType()->isPointerType() && "Strings are always arrays");
+
+ // Don't emit it as the address of the string, emit the string data itself
+ // as an inline array.
+ if (e->getCharByteWidth() == 1) { // only one-byte character literals are handled here
+ SmallString<64> str(e->getString());
+
+ // Resize the string to the right size, which is indicated by its type.
+ const ConstantArrayType *cat =
+ astContext.getAsConstantArrayType(e->getType());
+ uint64_t finalSize = cat->getZExtSize();
+ str.resize(finalSize);
+
+ mlir::Type eltTy = convertType(cat->getElementType());
+ return builder.getString(str, eltTy, finalSize);
+ }
+
+ errorNYI(e->getSourceRange(),
+ "getConstantArrayFromStringLiteral: wide characters");
+ return mlir::Attribute(); // null attribute, returned after the NYI report
+}
+
static bool shouldBeInCOMDAT(CIRGenModule &cgm, const Decl &d) {
assert(!cir::MissingFeatures::supportComdat());
@@ -749,6 +773,84 @@ CIRGenModule::getCIRLinkageVarDefinition(const VarDecl *vd, bool isConstant) {
return getCIRLinkageForDeclarator(vd, linkage, isConstant);
}
+static cir::GlobalOp generateStringLiteral(mlir::Location loc,
+ mlir::TypedAttr c, CIRGenModule &cgm,
+ StringRef globalName) {
+ assert(!cir::MissingFeatures::addressSpace());
+
+ // Create a global variable named globalName with the type of the constant.
+ // FIXME(cir): check for insertion point in module level.
+ cir::GlobalOp gv =
+ CIRGenModule::createGlobalOp(cgm, loc, globalName, c.getType());
+
+ // Only the initializer is set here; the asserts mark features still missing.
+ assert(!cir::MissingFeatures::opGlobalAlignment());
+ assert(!cir::MissingFeatures::opGlobalLinkage());
+ assert(!cir::MissingFeatures::opGlobalThreadLocal());
+ assert(!cir::MissingFeatures::opGlobalUnnamedAddr());
+ CIRGenModule::setInitializer(gv, c);
+ assert(!cir::MissingFeatures::supportComdat());
+ assert(!cir::MissingFeatures::opGlobalDSOLocal());
+ return gv;
+}
+
+// LLVM IR automatically uniques names when new llvm::GlobalVariables are
+// created. This is handy, for example, when creating globals for string
+// literals. Since we don't do that when creating cir::GlobalOp's, we need
+// a mechanism to generate a unique name in advance.
+//
+// For now, this mechanism is only used in cases where we know that the
+// name is compiler-generated, so we don't use the MLIR symbol table for
+// the lookup.
+std::string CIRGenModule::getUniqueGlobalName(const std::string &baseName) {
+  // If this is the first time we've generated a name for this basename, use
+  // it as is and start a counter for this base name. try_emplace both looks
+  // up and inserts, so the map is hashed only once per call.
+  auto [it, inserted] = cgGlobalNames.try_emplace(baseName, 0);
+  if (inserted)
+    return baseName;
+
+  // Otherwise append ".<n>", where n is the current counter value.
+  std::string result =
+      baseName + "." + std::to_string(it->second++);
+  // There should not be any symbol with this name in the module.
+  assert(!mlir::SymbolTable::lookupSymbolIn(theModule, result));
+  return result;
+}
+
+/// Create a uniquely named global initialized with the given string literal.
+cir::GlobalOp CIRGenModule::getGlobalForStringLiteral(const StringLiteral *s,
+ StringRef name) {
+ mlir::Attribute c = getConstantArrayFromStringLiteral(s);
+
+ if (getLangOpts().WritableStrings) {
+ errorNYI(s->getSourceRange(),
+ "getGlobalForStringLiteral: Writable strings");
+ }
+
+ // Mangle the string literal if that's how the ABI merges duplicate strings.
+ // Don't do it if they are writable, since we don't want writes in one TU to
+ // affect strings in another.
+ if (getCXXABI().getMangleContext().shouldMangleStringLiteral(s) &&
+ !getLangOpts().WritableStrings) {
+ errorNYI(s->getSourceRange(),
+ "getGlobalForStringLiteral: mangle string literals");
+ }
+
+ // Unlike LLVM IR, CIR doesn't automatically unique names for globals, so
+ // we need to do that explicitly.
+ std::string uniqueName = getUniqueGlobalName(name.str());
+ mlir::Location loc = getLoc(s->getSourceRange());
+ auto typedC = llvm::cast<mlir::TypedAttr>(c); // NOTE(review): c is null on the wide-character NYI path - confirm
+ assert(!cir::MissingFeatures::opGlobalAlignment());
+ cir::GlobalOp gv = generateStringLiteral(loc, typedC, *this, uniqueName);
+ assert(!cir::MissingFeatures::opGlobalDSOLocal());
+
+ assert(!cir::MissingFeatures::sanitizers());
+
+ return gv;
+}
+
void CIRGenModule::emitDeclContext(const DeclContext *dc) {
for (Decl *decl : dc->decls()) {
// Unlike other DeclContexts, the contents of an ObjCImplDecl at TU scope
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index b67239fcff44b..9828e1068e4fb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -126,6 +126,9 @@ class CIRGenModule : public CIRGenTypeCache {
llvm::StringRef name, mlir::Type t,
mlir::Operation *insertPoint = nullptr);
+ llvm::StringMap<unsigned> cgGlobalNames;
+ std::string getUniqueGlobalName(const std::string &baseName);
+
/// Return the mlir::Value for the address of the given global variable.
/// If Ty is non-null and if the global doesn't exist, then it will be created
/// with the specified type instead of whatever the normal requested type
@@ -136,6 +139,14 @@ class CIRGenModule : public CIRGenTypeCache {
getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty = {},
ForDefinition_t isForDefinition = NotForDefinition);
+ /// Return a constant array for the given string.
+ mlir::Attribute getConstantArrayFromStringLiteral(const StringLiteral *e);
+
+  /// Return the global holding a constant array for the given string literal.
+  /// \p name is the base symbol name; it is made unique if already taken.
+  cir::GlobalOp getGlobalForStringLiteral(const StringLiteral *s,
+                                          llvm::StringRef name = ".str");
+
const TargetCIRGenInfo &getTargetCIRGenInfo();
/// Helpers to convert the presumed location of Clang's SourceLocation to an
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 365569ce1f48a..2516007afd561 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -270,6 +270,18 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) {
result =
rewriter.create<mlir::LLVM::InsertValueOp>(loc, result, init, idx);
}
+ } else if (auto strAttr = mlir::dyn_cast<mlir::StringAttr>(attr.getElts())) {
+ // TODO(cir): this diverges from traditional lowering. Normally the string
+ // would be a global constant that is memcopied.
+ auto arrayTy = mlir::dyn_cast<cir::ArrayType>(strAttr.getType());
+ assert(arrayTy && "String attribute must have an array type");
+ mlir::Type eltTy = arrayTy.getElementType();
+ for (auto [idx, elt] : llvm::enumerate(strAttr)) {
+ auto init = rewriter.create<mlir::LLVM::ConstantOp>(
+ loc, converter->convertType(eltTy), elt);
+ result =
+ rewriter.create<mlir::LLVM::InsertValueOp>(loc, result, init, idx);
+ }
} else {
llvm_unreachable("unexpected ConstArrayAttr elements");
}
diff --git a/clang/test/CIR/CodeGen/string-literals.c b/clang/test/CIR/CodeGen/string-literals.c
new file mode 100644
index 0000000000000..873b00d9c9a98
--- /dev/null
+++ b/clang/test/CIR/CodeGen/string-literals.c
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+// LLVM: @[[STR1_GLOBAL:.*]] = dso_local global [2 x i8] c"1\00"
+// LLVM: @[[STR2_GLOBAL:.*]] = dso_local global [1 x i8] zeroinitializer
+// LLVM: @[[STR3_GLOBAL:.*]] = dso_local global [2 x i8] zeroinitializer
+
+// OGCG: @[[STR1_GLOBAL:.*]] = private unnamed_addr constant [2 x i8] c"1\00"
+// OGCG: @[[STR2_GLOBAL:.*]] = private unnamed_addr constant [1 x i8] zeroinitializer
+// OGCG: @[[STR3_GLOBAL:.*]] = private unnamed_addr constant [2 x i8] zeroinitializer
+
+char *f1() {
+ return "1"; // one character plus the terminator: a 2-element array
+}
+
+// CIR: cir.global external @[[STR1_GLOBAL:.*]] = #cir.const_array<"1\00" : !cir.array<!s8i x 2>> : !cir.array<!s8i x 2>
+// CIR: cir.func @f1()
+// CIR: %[[STR:.*]] = cir.get_global @[[STR1_GLOBAL]] : !cir.ptr<!cir.array<!s8i x 2>>
+
+// LLVM: define ptr @f1()
+// LLVM: store ptr @[[STR1_GLOBAL]], ptr {{.*}}
+
+// OGCG: define {{.*}}ptr @f1()
+// OGCG: ret ptr @[[STR1_GLOBAL]]
+
+char *f2() {
+ return ""; // terminator only: a 1-element, all-zero array
+}
+
+// CIR: cir.global external @[[STR2_GLOBAL:.*]] = #cir.zero : !cir.array<!s8i x 1>
+// CIR: cir.func @f2()
+// CIR: %[[STR2:.*]] = cir.get_global @[[STR2_GLOBAL]] : !cir.ptr<!cir.array<!s8i x 1>>
+
+// LLVM: define ptr @f2()
+// LLVM: store ptr @[[STR2_GLOBAL]], ptr {{.*}}
+
+// OGCG: define {{.*}}ptr @f2()
+// OGCG: ret ptr @[[STR2_GLOBAL]]
+
+char *f3() {
+ return "\00"; // octal escape for one null byte, plus the terminator: 2 zero elements
+}
+
+// CIR: cir.global external @[[STR3_GLOBAL:.*]] = #cir.zero : !cir.array<!s8i x 2>
+// CIR: cir.func @f3()
+// CIR: %[[STR3:.*]] = cir.get_global @[[STR3_GLOBAL]] : !cir.ptr<!cir.array<!s8i x 2>>
+
+// LLVM: define ptr @f3()
+// LLVM: store ptr @[[STR3_GLOBAL]], ptr {{.*}}
+
+// OGCG: define {{.*}}ptr @f3()
+// OGCG: ret ptr @[[STR3_GLOBAL]]
>From ec81084670a0f0866a2b43e106aa7a7921b8090c Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor at nvidia.com>
Date: Tue, 20 May 2025 15:28:01 -0700
Subject: [PATCH 2/3] Incorporate review feedback
---
clang/lib/CIR/CodeGen/CIRGenBuilder.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index aff8b8949f3ad..581f9f7152d88 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -30,8 +30,8 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
/// Note: This is different from what is returned by
/// mlir::Builder::getStringAttr() which is an mlir::StringAttr.
mlir::Attribute getString(llvm::StringRef str, mlir::Type eltTy,
- unsigned size) {
- unsigned finalSize = size ? size : str.size();
+ std::optional<size_t> size) {
+ size_t finalSize = size ? *size : str.size();
size_t lastNonZeroPos = str.find_last_not_of('\0');
// If the string is full of null bytes, emit a #cir.zero rather than
@@ -41,7 +41,7 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
return cir::ZeroAttr::get(arrayTy);
}
// We emit trailing zeros only if there are multiple trailing zeros.
- int trailingZerosNum = 0;
+ size_t trailingZerosNum = 0;
if (finalSize > lastNonZeroPos + 2)
trailingZerosNum = finalSize - lastNonZeroPos - 1;
auto truncatedArrayTy =
>From b6b9d31bafafe59f70aa45321f00d4e1cd7518cc Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor at nvidia.com>
Date: Tue, 20 May 2025 16:40:43 -0700
Subject: [PATCH 3/3] Accepted suggestion
---
clang/lib/CIR/CodeGen/CIRGenBuilder.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 581f9f7152d88..9c0968f144eef 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -31,7 +31,7 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
/// mlir::Builder::getStringAttr() which is an mlir::StringAttr.
mlir::Attribute getString(llvm::StringRef str, mlir::Type eltTy,
std::optional<size_t> size) {
- size_t finalSize = size ? *size : str.size();
+ size_t finalSize = size.value_or(str.size());
size_t lastNonZeroPos = str.find_last_not_of('\0');
// If the string is full of null bytes, emit a #cir.zero rather than
More information about the cfe-commits
mailing list