[clang] [clang] implement current CWG direction for string literal comparisons in constant evaluation (PR #109208)

Richard Smith via cfe-commits cfe-commits at lists.llvm.org
Wed Sep 18 15:27:25 PDT 2024


https://github.com/zygoloid updated https://github.com/llvm/llvm-project/pull/109208

>From 81193568c17a89f6cf42f43a82fb1fbf0f90184d Mon Sep 17 00:00:00 2001
From: Richard Smith <richard at metafoo.co.uk>
Date: Wed, 18 Sep 2024 21:59:56 +0000
Subject: [PATCH 1/3] Implement current CWG direction for string literal
 comparisons.

Track the identity of each string literal object produced by evaluation
with a global version number. Accept comparisons between literals of the
same version, and between literals of different versions that cannot
possibly be placed in overlapping storage. Treat the remaining
comparisons as non-constant.
---
 clang/include/clang/AST/ASTContext.h          |  11 ++
 .../include/clang/Basic/DiagnosticASTKinds.td |   2 +
 clang/lib/AST/ExprConstant.cpp                | 119 +++++++++++++++---
 clang/test/AST/ByteCode/builtin-functions.cpp |   3 +-
 clang/test/AST/ByteCode/cxx20.cpp             |  20 ++-
 clang/test/SemaCXX/builtins.cpp               |   2 +-
 .../SemaCXX/constant-expression-cxx11.cpp     |  36 ++++--
 7 files changed, 154 insertions(+), 39 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index b65a1f7dff5bc1..6170bcd4f15ae3 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -324,6 +324,13 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// This is lazily created.  This is intentionally not serialized.
   mutable llvm::StringMap<StringLiteral *> StringLiteralCache;
 
+  /// The next string literal "version" to allocate during constant evaluation.
+  /// This is used to distinguish between repeated evaluations of the same
+  /// string literal.
+  ///
+  /// TODO: Ensure version numbers don't collide when deserialized.
+  unsigned NextStringLiteralVersion = 0;
+
   /// MD5 hash of CUID. It is calculated when first used and cached by this
   /// data member.
   mutable std::string CUIDHash;
@@ -3278,6 +3285,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// PredefinedExpr to cache evaluated results.
   StringLiteral *getPredefinedStringLiteralFromCache(StringRef Key) const;
 
+  /// Return the next version number to be used for a string literal evaluated
+  /// as part of constant evaluation.
+  unsigned getNextStringLiteralVersion() { return NextStringLiteralVersion++; }
+
   /// Return a declaration for the global GUID object representing the given
   /// GUID value.
   MSGuidDecl *getMSGuidDecl(MSGuidDeclParts Parts) const;
diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td
index 21a307d1e89878..76e693f6b4a6ca 100644
--- a/clang/include/clang/Basic/DiagnosticASTKinds.td
+++ b/clang/include/clang/Basic/DiagnosticASTKinds.td
@@ -96,6 +96,8 @@ def note_constexpr_pointer_constant_comparison : Note<
   "at runtime">;
 def note_constexpr_literal_comparison : Note<
   "comparison of addresses of literals has unspecified value">;
+def note_constexpr_opaque_call_comparison : Note<
+  "comparison against opaque constant has unspecified value">;
 def note_constexpr_pointer_weak_comparison : Note<
   "comparison against address of weak declaration '%0' can only be performed "
   "at runtime">;
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 6387e375dda79c..d9384a7c125a82 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -54,8 +54,10 @@
 #include "clang/Basic/DiagnosticSema.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/APFixedPoint.h"
+#include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/SipHash.h"
@@ -2061,8 +2063,8 @@ static bool EvaluateIgnoredValue(EvalInfo &Info, const Expr *E) {
   return true;
 }
 
-/// Should this call expression be treated as a no-op?
-static bool IsNoOpCall(const CallExpr *E) {
+/// Should this call expression be treated as forming an opaque constant?
+static bool IsOpaqueConstantCall(const CallExpr *E) {
   unsigned Builtin = E->getBuiltinCallee();
   return (Builtin == Builtin::BI__builtin___CFStringMakeConstantString ||
           Builtin == Builtin::BI__builtin___NSStringMakeConstantString ||
@@ -2070,6 +2072,12 @@ static bool IsNoOpCall(const CallExpr *E) {
           Builtin == Builtin::BI__builtin_function_start);
 }
 
+static bool IsOpaqueConstantCall(const LValue &LVal) {
+  auto *BaseExpr =
+      llvm::dyn_cast_or_null<CallExpr>(LVal.Base.dyn_cast<const Expr *>());
+  return BaseExpr && IsOpaqueConstantCall(BaseExpr);
+}
+
 static bool IsGlobalLValue(APValue::LValueBase B) {
   // C++11 [expr.const]p3 An address constant expression is a prvalue core
   // constant expression of pointer type that evaluates to...
@@ -2115,7 +2123,7 @@ static bool IsGlobalLValue(APValue::LValueBase B) {
   case Expr::ObjCBoxedExprClass:
     return cast<ObjCBoxedExpr>(E)->isExpressibleAsConstantInitializer();
   case Expr::CallExprClass:
-    return IsNoOpCall(cast<CallExpr>(E));
+    return IsOpaqueConstantCall(cast<CallExpr>(E));
   // For GCC compatibility, &&label has static storage duration.
   case Expr::AddrLabelExprClass:
     return true;
@@ -2142,11 +2150,81 @@ static const ValueDecl *GetLValueBaseDecl(const LValue &LVal) {
   return LVal.Base.dyn_cast<const ValueDecl*>();
 }
 
-static bool IsLiteralLValue(const LValue &Value) {
-  if (Value.getLValueCallIndex())
+// Information about an LValueBase that is some kind of string.
+struct LValueBaseString {
+  std::string ObjCEncodeStorage;
+  StringRef Bytes;
+  int CharWidth;
+};
+
+// Gets the lvalue base of LVal as a string.
+static bool GetLValueBaseAsString(const EvalInfo &Info, const LValue &LVal,
+                                  LValueBaseString &AsString) {
+  const auto *BaseExpr = LVal.Base.dyn_cast<const Expr *>();
+  if (!BaseExpr)
+    return false;
+
+  // For ObjCEncodeExpr, we need to compute and store the string.
+  if (const auto *EE = dyn_cast<ObjCEncodeExpr>(BaseExpr)) {
+    Info.Ctx.getObjCEncodingForType(EE->getEncodedType(),
+                                    AsString.ObjCEncodeStorage);
+    AsString.Bytes = AsString.ObjCEncodeStorage;
+    AsString.CharWidth = 1;
+    return true;
+  }
+
+  // Otherwise, we have a StringLiteral.
+  const auto *Lit = dyn_cast<StringLiteral>(BaseExpr);
+  if (const auto *PE = dyn_cast<PredefinedExpr>(BaseExpr))
+    Lit = PE->getFunctionName();
+
+  if (!Lit)
+    return false;
+
+  AsString.Bytes = Lit->getBytes();
+  AsString.CharWidth = Lit->getCharByteWidth();
+  return true;
+}
+
+// Determine whether two string literals potentially overlap. This will be the
+// case if they agree on the values of all the bytes on the overlapping region
+// between them.
+static bool ArePotentiallyOverlappingStringLiterals(const EvalInfo &Info,
+                                                    const LValue &LHS,
+                                                    const LValue &RHS) {
+  LValueBaseString LHSString, RHSString;
+  if (!GetLValueBaseAsString(Info, LHS, LHSString) ||
+      !GetLValueBaseAsString(Info, RHS, RHSString))
     return false;
-  const Expr *E = Value.Base.dyn_cast<const Expr*>();
-  return E && !isa<MaterializeTemporaryExpr>(E);
+
+  // This is the byte offset to the location of the first character of LHS
+  // within RHS. We don't need to look at the characters of one string that
+  // would appear before the start of the other string if they were merged.
+  CharUnits Offset = RHS.Offset - LHS.Offset;
+  if (Offset.isPositive()) {
+    RHSString.Bytes = RHSString.Bytes.drop_front(Offset.getQuantity());
+  } else if (Offset.isNegative()) {
+    LHSString.Bytes = LHSString.Bytes.drop_front(-Offset.getQuantity());
+  }
+
+  bool LHSIsLonger = LHSString.Bytes.size() > RHSString.Bytes.size();
+  StringRef Longer = LHSIsLonger ? LHSString.Bytes : RHSString.Bytes;
+  StringRef Shorter = LHSIsLonger ? RHSString.Bytes : LHSString.Bytes;
+  int ShorterCharWidth = (LHSIsLonger ? RHSString : LHSString).CharWidth;
+
+  // The null terminator isn't included in the string data, so check for it
+  // manually. If the longer string doesn't have a null terminator where the
+  // shorter string ends, they aren't potentially overlapping.
+  for (int nullByte : llvm::seq(ShorterCharWidth)) {
+    if (Shorter.size() + nullByte >= Longer.size())
+      break;
+    if (Longer[Shorter.size() + nullByte])
+      return false;
+  }
+
+  // Otherwise, they're potentially overlapping if and only if the overlapping
+  // region is the same.
+  return Shorter == Longer.take_front(Shorter.size());
 }
 
 static bool IsWeakLValue(const LValue &Value) {
@@ -8573,7 +8651,10 @@ class LValueExprEvaluator
   bool VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *E);
   bool VisitCompoundLiteralExpr(const CompoundLiteralExpr *E);
   bool VisitMemberExpr(const MemberExpr *E);
-  bool VisitStringLiteral(const StringLiteral *E) { return Success(E); }
+  bool VisitStringLiteral(const StringLiteral *E) {
+    uint64_t Version = Info.getASTContext().getNextStringLiteralVersion();
+    return Success(APValue::LValueBase(E, 0, static_cast<unsigned>(Version)));
+  }
   bool VisitObjCEncodeExpr(const ObjCEncodeExpr *E) { return Success(E); }
   bool VisitCXXTypeidExpr(const CXXTypeidExpr *E);
   bool VisitCXXUuidofExpr(const CXXUuidofExpr *E);
@@ -9639,7 +9720,7 @@ static bool isOneByteCharacterType(QualType T) {
 
 bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
                                                 unsigned BuiltinOp) {
-  if (IsNoOpCall(E))
+  if (IsOpaqueConstantCall(E))
     return Success(E);
 
   switch (BuiltinOp) {
@@ -13889,13 +13970,21 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
           (!RHSValue.Base && !RHSValue.Offset.isZero()))
         return DiagComparison(diag::note_constexpr_pointer_constant_comparison,
                               !RHSValue.Base);
-      // It's implementation-defined whether distinct literals will have
-      // distinct addresses. In clang, the result of such a comparison is
-      // unspecified, so it is not a constant expression. However, we do know
-      // that the address of a literal will be non-null.
-      if ((IsLiteralLValue(LHSValue) || IsLiteralLValue(RHSValue)) &&
-          LHSValue.Base && RHSValue.Base)
+      // C++2c [intro.object]/10:
+      //   Two objects [...] may have the same address if [...] they are both
+      //   potentially non-unique objects.
+      // C++2c [intro.object]/9:
+      //   An object is potentially non-unique if it is a string literal object,
+      //   the backing array of an initializer list, or a subobject thereof.
+      //
+      // This makes the comparison result unspecified, so it's not a constant
+      // expression.
+      //
+      // TODO: Do we need to handle the initializer list case here?
+      if (ArePotentiallyOverlappingStringLiterals(Info, LHSValue, RHSValue))
         return DiagComparison(diag::note_constexpr_literal_comparison);
+      if (IsOpaqueConstantCall(LHSValue) || IsOpaqueConstantCall(RHSValue))
+        return DiagComparison(diag::note_constexpr_opaque_call_comparison);
       // We can't tell whether weak symbols will end up pointing to the same
       // object.
       if (IsWeakLValue(LHSValue) || IsWeakLValue(RHSValue))
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 9fd5eae67a21f6..3de0baaade50d0 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -966,7 +966,8 @@ namespace shufflevector {
 namespace FunctionStart {
   void a(void) {}
   static_assert(__builtin_function_start(a) == a, ""); // both-error {{not an integral constant expression}} \
-                                                       // both-note {{comparison of addresses of literals has unspecified value}}
+                                                       // ref-note {{comparison against opaque constant has unspecified value}} \
+                                                       // expected-note {{comparison of addresses of literals has unspecified value}}
 }
 
 namespace BuiltinInImplicitCtor {
diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp
index 9bbc3dbe0073d3..f2a87bae55b13e 100644
--- a/clang/test/AST/ByteCode/cxx20.cpp
+++ b/clang/test/AST/ByteCode/cxx20.cpp
@@ -99,7 +99,7 @@ constexpr int f() {
 static_assert(f());
 #endif
 
-/// Distinct literals have disctinct addresses.
+/// Distinct literals have distinct addresses.
 /// see https://github.com/llvm/llvm-project/issues/58754
 constexpr auto foo(const char *p) { return p; }
 constexpr auto p1 = "test1";
@@ -108,22 +108,16 @@ constexpr auto p2 = "test2";
 constexpr bool b1 = foo(p1) == foo(p1);
 static_assert(b1);
 
-constexpr bool b2 = foo(p1) == foo(p2); // ref-error {{must be initialized by a constant expression}} \
-                                        // ref-note {{comparison of addresses of literals}} \
-                                        // ref-note {{declared here}}
-static_assert(!b2); // ref-error {{not an integral constant expression}} \
-                    // ref-note {{not a constant expression}}
+constexpr bool b2 = foo(p1) == foo(p2);
+static_assert(!b2);
 
 constexpr auto name1() { return "name1"; }
 constexpr auto name2() { return "name2"; }
 
-constexpr auto b3 = name1() == name1();
-static_assert(b3);
-constexpr auto b4 = name1() == name2(); // ref-error {{must be initialized by a constant expression}} \
-                                        // ref-note {{has unspecified value}} \
-                                        // ref-note {{declared here}}
-static_assert(!b4); // ref-error {{not an integral constant expression}} \
-                    // ref-note {{not a constant expression}}
+constexpr auto b3 = name1() == name1(); // ref-error {{must be initialized by a constant expression}} \
+                                        // ref-note {{comparison of addresses of literals}}
+constexpr auto b4 = name1() == name2();
+static_assert(!b4);
 
 namespace UninitializedFields {
   class A {
diff --git a/clang/test/SemaCXX/builtins.cpp b/clang/test/SemaCXX/builtins.cpp
index f47ed3a1f7ebfc..7df405f0662a15 100644
--- a/clang/test/SemaCXX/builtins.cpp
+++ b/clang/test/SemaCXX/builtins.cpp
@@ -47,7 +47,7 @@ void a(void) {}
 int n;
 void *p = __builtin_function_start(n);               // expected-error {{argument must be a function}}
 static_assert(__builtin_function_start(a) == a, ""); // expected-error {{static assertion expression is not an integral constant expression}}
-// expected-note at -1 {{comparison of addresses of literals has unspecified value}}
+// expected-note at -1 {{comparison against opaque constant has unspecified value}}
 } // namespace function_start
 
 void no_ms_builtins() {
diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index 44ef540f41fa8c..3d56c3e351f9bd 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -2,6 +2,10 @@
 // RUN: %clang_cc1 -std=c++20 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx11_20,cxx20_23,pre-cxx23 -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
 // RUN: %clang_cc1 -std=c++11 -isystem %S/Inputs -fsyntax-only -verify=expected,cxx11_20,cxx11,pre-cxx23    -triple x86_64-linux -Wno-string-plus-int -Wno-pointer-arith -Wno-zero-length-array -Wno-c99-designator -fcxx-exceptions -pedantic %s -Wno-comment -Wno-tautological-pointer-compare -Wno-bool-conversion
 
+// This macro forces its argument to be constant-folded, even if it's not
+// otherwise a constant expression.
+#define fold(x) (__builtin_constant_p(x) ? (x) : (x))
+
 namespace StaticAssertFoldTest {
 
 int x;
@@ -358,11 +362,31 @@ struct Str {
 
 extern char externalvar[];
 constexpr bool constaddress = (void *)externalvar == (void *)0x4000UL; // expected-error {{must be initialized by a constant expression}} expected-note {{reinterpret_cast}}
-constexpr bool litaddress = "foo" == "foo"; // expected-error {{must be initialized by a constant expression}}
-// expected-note at -1 {{comparison of addresses of literals has unspecified value}}
-// cxx20_23-warning at -2 {{comparison between two arrays is deprecated}}
 static_assert(0 != "foo", "");
 
+// OK: These string literals cannot possibly overlap.
+static_assert(+"foo" != +"bar", "");
+static_assert("xfoo" + 1 != "yfoo" + 1, "");
+static_assert(+"foot" != +"foo", "");
+static_assert(+"foo\0bar" != +"foo\0baz", "");
+
+// These can't overlap because the null terminator for UTF-16 is two bytes wide.
+static_assert(fold((const char*)u"A" != (const char*)"\0A\0x"), "");
+static_assert(fold((const char*)u"A" != (const char*)"A\0\0x"), "");
+
+// These strings may overlap, and so the result of the comparison is unknown.
+constexpr bool may_overlap_1 = +"foo" == +"foo"; // expected-error {{}} expected-note {{addresses of literals}}
+constexpr bool may_overlap_2 = +"foo" == +"foo\0bar"; // expected-error {{}} expected-note {{addresses of literals}}
+constexpr bool may_overlap_3 = +"foo" == "bar\0foo" + 4; // expected-error {{}} expected-note {{addresses of literals}}
+constexpr bool may_overlap_4 = "xfoo" + 1 == "xfoo" + 1; // expected-error {{}} expected-note {{addresses of literals}}
+
+// These may overlap even though they have different encodings.
+// One of these two comparisons is non-constant, but due to endianness we don't
+// know which one.
+constexpr bool may_overlap_different_encoding[] =
+  {fold((const char*)u"A" != (const char*)"xA\0\0\0x" + 1), fold((const char*)u"A" != (const char*)"x\0A\0\0x" + 1)};
+  // expected-error at -2 {{}} expected-note at -1 {{addresses of literals}}
+
 }
 
 namespace MaterializeTemporary {
@@ -1543,16 +1567,10 @@ namespace MutableMembers {
 
 namespace Fold {
 
-  // This macro forces its argument to be constant-folded, even if it's not
-  // otherwise a constant expression.
-  #define fold(x) (__builtin_constant_p(x) ? (x) : (x))
-
   constexpr int n = (long)(char*)123; // expected-error {{constant expression}} expected-note {{reinterpret_cast}}
   constexpr int m = fold((long)(char*)123); // ok
   static_assert(m == 123, "");
 
-  #undef fold
-
 }
 
 namespace DR1454 {

>From 2f8cfbe8c5f09c89b7a572b65985c1e1693e8d89 Mon Sep 17 00:00:00 2001
From: Richard Smith <richard at metafoo.co.uk>
Date: Wed, 18 Sep 2024 22:23:06 +0000
Subject: [PATCH 2/3] Add a test that repeated evaluation in the same function
 call might produce different string literals each time.

---
 clang/test/SemaCXX/constant-expression-cxx14.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
index 70ab5dcd357c1c..01326aed8be63d 100644
--- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
@@ -1306,3 +1306,18 @@ constexpr int field(int a) {
 static_assert(field(3), ""); // expected-error {{constant expression}} \
                              // expected-note {{in call to 'field(3)'}}
 }
+
+namespace literal_comparison {
+
+constexpr bool different_in_loop(bool b = false) {
+  if (b) return false;
+
+  const char *p[2] = {};
+  for (const char *&r : p)
+    r = "hello";
+  return p[0] == p[1]; // expected-note {{addresses of literals}}
+}
+constexpr bool check = different_in_loop();
+  // expected-error at -1 {{}} expected-note at -1 {{in call}}
+
+}
\ No newline at end of file

>From def1aa7c677fd5fdaa4a8b4c24a5858e03371fb1 Mon Sep 17 00:00:00 2001
From: Richard Smith <richard at metafoo.co.uk>
Date: Wed, 18 Sep 2024 22:27:09 +0000
Subject: [PATCH 3/3] A couple more tests.

---
 clang/test/SemaCXX/constant-expression-cxx11.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index 3d56c3e351f9bd..e2ea984b37cd04 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -374,6 +374,11 @@ static_assert(+"foo\0bar" != +"foo\0baz", "");
 static_assert(fold((const char*)u"A" != (const char*)"\0A\0x"), "");
 static_assert(fold((const char*)u"A" != (const char*)"A\0\0x"), "");
 
+constexpr const char *string = "hello";
+constexpr const char *also_string = string;
+static_assert(string == string, "");
+static_assert(string == also_string, "");
+
 // These strings may overlap, and so the result of the comparison is unknown.
 constexpr bool may_overlap_1 = +"foo" == +"foo"; // expected-error {{}} expected-note {{addresses of literals}}
 constexpr bool may_overlap_2 = +"foo" == +"foo\0bar"; // expected-error {{}} expected-note {{addresses of literals}}



More information about the cfe-commits mailing list