[clang] [clang] Introduce "binary" StringLiteral for #embed data (PR #127629)

Mariya Podchishchaeva via cfe-commits cfe-commits at lists.llvm.org
Tue Feb 18 05:36:31 PST 2025


https://github.com/Fznamznon updated https://github.com/llvm/llvm-project/pull/127629

>From 700ec6f78c0a24729801bea381bafbcafb06826b Mon Sep 17 00:00:00 2001
From: "Podchishchaeva, Mariya" <mariya.podchishchaeva at intel.com>
Date: Tue, 18 Feb 2025 05:12:07 -0800
Subject: [PATCH 1/2] [clang] Introduce "binary" StringLiteral for #embed data

StringLiteral is used as the internal data of EmbedExpr, and we use it directly
as an initializer if a single EmbedExpr appears in the initializer list of a
char array. This is fast and convenient, but it causes problems when string
literal character values are checked, because #embed data values lie in the
range [0, 2^(char width) - 1], whereas an ordinary StringLiteral has type char,
which may be signed.
This PR introduces a new kind of StringLiteral that holds binary data coming
from an embedded resource, to mitigate these problems. The new kind of
StringLiteral is not assumed to have signed char type. It also helps to
prevent crashes when trying to find StringLiteral token locations, since
these simply do not exist for binary data.

Fixes https://github.com/llvm/llvm-project/issues/119256
---
 clang/include/clang/AST/Expr.h            |  7 ++++---
 clang/lib/AST/Expr.cpp                    |  8 ++++++++
 clang/lib/Parse/ParseInit.cpp             |  2 +-
 clang/lib/Sema/SemaInit.cpp               |  1 +
 clang/test/Preprocessor/embed_constexpr.c | 21 +++++++++++++++++++++
 5 files changed, 35 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Preprocessor/embed_constexpr.c

diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index cd584d9621a22..cf6f63b8711b8 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -1752,7 +1752,8 @@ enum class StringLiteralKind {
   UTF8,
   UTF16,
   UTF32,
-  Unevaluated
+  Unevaluated,
+  Binary
 };
 
 /// StringLiteral - This represents a string literal expression, e.g. "foo"
@@ -4965,9 +4966,9 @@ class EmbedExpr final : public Expr {
       assert(EExpr && CurOffset != ULLONG_MAX &&
              "trying to dereference an invalid iterator");
       IntegerLiteral *N = EExpr->FakeChildNode;
-      StringRef DataRef = EExpr->Data->BinaryData->getBytes();
       N->setValue(*EExpr->Ctx,
-                  llvm::APInt(N->getValue().getBitWidth(), DataRef[CurOffset],
+                  llvm::APInt(N->getValue().getBitWidth(),
+                              EExpr->Data->BinaryData->getCodeUnit(CurOffset),
                               N->getType()->isSignedIntegerType()));
       // We want to return a reference to the fake child node in the
       // EmbedExpr, not the local variable N.
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 6f570139630d8..2747480f00d68 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1104,6 +1104,7 @@ unsigned StringLiteral::mapCharByteWidth(TargetInfo const &Target,
   switch (SK) {
   case StringLiteralKind::Ordinary:
   case StringLiteralKind::UTF8:
+  case StringLiteralKind::Binary:
     CharByteWidth = Target.getCharWidth();
     break;
   case StringLiteralKind::Wide:
@@ -1216,6 +1217,7 @@ void StringLiteral::outputString(raw_ostream &OS) const {
   switch (getKind()) {
   case StringLiteralKind::Unevaluated:
   case StringLiteralKind::Ordinary:
+  case StringLiteralKind::Binary:
     break; // no prefix.
   case StringLiteralKind::Wide:
     OS << 'L';
@@ -1332,11 +1334,17 @@ StringLiteral::getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
                                  const LangOptions &Features,
                                  const TargetInfo &Target, unsigned *StartToken,
                                  unsigned *StartTokenByteOffset) const {
+  // No source location of bytes for binary literals since they don't come from
+  // source.
+  if (getKind() == StringLiteralKind::Binary)
+    return getStrTokenLoc(0);
+
   assert((getKind() == StringLiteralKind::Ordinary ||
           getKind() == StringLiteralKind::UTF8 ||
           getKind() == StringLiteralKind::Unevaluated) &&
          "Only narrow string literals are currently supported");
 
+
   // Loop over all of the tokens in this string until we find the one that
   // contains the byte we're looking for.
   unsigned TokNo = 0;
diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp
index 63b1d7bd9db53..471b3eaf28287 100644
--- a/clang/lib/Parse/ParseInit.cpp
+++ b/clang/lib/Parse/ParseInit.cpp
@@ -445,7 +445,7 @@ ExprResult Parser::createEmbedExpr() {
           Context.MakeIntValue(Str.size(), Context.getSizeType());
       QualType ArrayTy = Context.getConstantArrayType(
           Ty, ArraySize, nullptr, ArraySizeModifier::Normal, 0);
-      return StringLiteral::Create(Context, Str, StringLiteralKind::Ordinary,
+      return StringLiteral::Create(Context, Str, StringLiteralKind::Binary,
                                    false, ArrayTy, StartLoc);
     };
 
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 6a76e6d74a4b0..013e57df6615c 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -106,6 +106,7 @@ static StringInitFailureKind IsStringInit(Expr *Init, const ArrayType *AT,
       return SIF_None;
     [[fallthrough]];
   case StringLiteralKind::Ordinary:
+  case StringLiteralKind::Binary:
     // char array can be initialized with a narrow string.
     // Only allow char x[] = "foo";  not char x[] = L"foo";
     if (ElemTy->isCharType())
diff --git a/clang/test/Preprocessor/embed_constexpr.c b/clang/test/Preprocessor/embed_constexpr.c
new file mode 100644
index 0000000000000..e444dfec158b5
--- /dev/null
+++ b/clang/test/Preprocessor/embed_constexpr.c
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -std=c23
+
+static constexpr unsigned char data[] = {
+#embed "big_char.txt"
+};
+
+static constexpr char data1[] = {
+#embed "big_char.txt" // expected-error {{constexpr initializer evaluates to 255 which is not exactly representable in type 'const char'}}
+};
+
+static constexpr int data2[] = {
+#embed "big_char.txt"
+};
+
+static constexpr unsigned data3[] = {
+#embed "big_char.txt" suffix(, -1) // expected-error {{constexpr initializer evaluates to -1 which is not exactly representable in type 'const unsigned int'}}
+};
+
+static constexpr int data4[] = {
+#embed "big_char.txt" suffix(, -1)
+};

>From 97c1c0411afc85264ee15876db50494d6b52c93a Mon Sep 17 00:00:00 2001
From: "Podchishchaeva, Mariya" <mariya.podchishchaeva at intel.com>
Date: Tue, 18 Feb 2025 05:35:51 -0800
Subject: [PATCH 2/2] Make format happy

---
 clang/lib/AST/Expr.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 2747480f00d68..e48b389fbc2c2 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -1344,7 +1344,6 @@ StringLiteral::getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
           getKind() == StringLiteralKind::Unevaluated) &&
          "Only narrow string literals are currently supported");
 
-
   // Loop over all of the tokens in this string until we find the one that
   // contains the byte we're looking for.
   unsigned TokNo = 0;



More information about the cfe-commits mailing list