[clang] [llvm] [Clang] Emit TBAA info for enums in C (PR #73326)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 7 06:19:15 PST 2023


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/73326

>From af76f6b6b3469fd0f5f24427c5a175c8d9d7c83a Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 24 Nov 2023 13:20:23 +0000
Subject: [PATCH 1/5] [Clang] Emit TBAA info for enums in C

When emitting TBAA information for enums in C code we
currently just treat the data as an 'omnipotent char'.
However, with C strict aliasing this means we fail to
optimise certain cases. For example, in the SPEC2017
xz benchmark there are structs that contain arrays of
enums, and clang pessmistically assumes that accesses
to those enums could alias with other struct members
that have a different type.

According to

https://en.cppreference.com/w/c/language/enum

enums should be treated as 'int' types unless
explicitly specified (C23) or if 'int' would not be
large enough to hold all the enumerated values. In the
latter case the compiler is free to choose a suitable
integer that would hold all such values.

When compiling C code this patch generates TBAA
information for the enum by using an equivalent integer
of the size clang has already chosen for the enum. I
have ignored C++ for now because the rules are more
complex.

New test added here:

  clang/test/CodeGen/tbaa.c
---
 clang/lib/CodeGen/CodeGenTBAA.cpp |   5 +-
 clang/test/CodeGen/tbaa.c         | 116 ++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/tbaa.c

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 8705d3d65f1a5..f59d3d422d520 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -196,11 +196,14 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
+    if (!Features.CPlusPlus)
+      return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));
+
     // In C++ mode, types have linkage, so we can rely on the ODR and
     // on their mangled names, if they're external.
     // TODO: Is there a way to get a program-wide unique name for a
     // decl with local linkage or no linkage?
-    if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible())
+    if (!ETy->getDecl()->isExternallyVisible())
       return getChar();
 
     SmallString<256> OutName;
diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c
new file mode 100644
index 0000000000000..0ab81f60a7194
--- /dev/null
+++ b/clang/test/CodeGen/tbaa.c
@@ -0,0 +1,116 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef enum {
+  RED_AUTO_32,
+  GREEN_AUTO_32,
+  BLUE_AUTO_32
+} EnumAuto32;
+
+typedef enum {
+  RED_AUTO_64,
+  GREEN_AUTO_64,
+  BLUE_AUTO_64 = 0x100000000ull
+} EnumAuto64;
+
+typedef enum : uint16_t {
+  RED_16,
+  GREEN_16,
+  BLUE_16
+} Enum16;
+
+typedef enum : uint8_t {
+  RED_8,
+  GREEN_8,
+  BLUE_8
+} Enum8;
+
+uint32_t g0(EnumAuto32 *E, uint32_t *val) {
+// CHECK-LABEL: define{{.*}} i32 @g0(
+// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH-LABEL: define{{.*}} i32 @g0(
+// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
+// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+  *val = 5;
+  *E = RED_AUTO_32;
+  return *val;
+}
+
+uint64_t g1(EnumAuto64 *E, uint64_t *val) {
+// CHECK-LABEL: define{{.*}} i64 @g1(
+// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+// PATH-LABEL: define{{.*}} i64 @g1(
+// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
+// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
+// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+  *val = 5;
+  *E = RED_AUTO_64;
+  return *val;
+}
+
+uint16_t g2(Enum16 *E, uint16_t *val) {
+// CHECK-LABEL: define{{.*}} i16 @g2(
+// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.*]]
+// CHECK: store i16 0, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]]
+// CHECK: load i16, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]]
+// PATH-LABEL: define{{.*}} i16 @g2(
+// PATH: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.*]]
+// PATH: store i16 0, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]]
+// PATH: load i16, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]]
+  *val = 5;
+  *E = RED_16;
+  return *val;
+}
+
+uint8_t g3(Enum8 *E, uint8_t *val) {
+// CHECK-LABEL: define{{.*}} i8 @g3(
+// CHECK: store i8 5, ptr %{{.*}}, align 1, !tbaa [[TAG_i8:!.*]]
+// CHECK: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]]
+// CHECK: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]]
+// PATH-LABEL: define{{.*}} i8 @g3(
+// PATH: store i8 5, ptr %{{.*}}, align 1, !tbaa [[TAG_i8:!.*]]
+// PATH: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]]
+// PATH: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]]
+  *val = 5;
+  *E = RED_8;
+  return *val;
+}
+
+// CHECK: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_c_tbaa:!.*]],
+// CHECK: [[TAG_c_tbaa]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0}
+// CHECK: [[TYPE_i32]] = !{!"int", [[TYPE_char]],
+// CHECK: [[TAG_i64]] = !{[[TYPE_i64:!.*]], [[TYPE_i64]], i64 0}
+// CHECK: [[TYPE_i64]] = !{!"long long", [[TYPE_char]],
+// CHECK: [[TAG_long]] = !{[[TYPE_long:!.*]], [[TYPE_long]], i64 0}
+// CHECK: [[TYPE_long]] = !{!"long", [[TYPE_char]],
+// CHECK: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0}
+// CHECK: [[TYPE_i16]] = !{!"short", [[TYPE_char]],
+// CHECK: [[TAG_i8]] = !{[[TYPE_i8:!.*]], [[TYPE_char]], i64 0}
+
+// PATH: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_c_tbaa:!.*]],
+// PATH: [[TAG_c_tbaa]] = !{!"Simple C/C++ TBAA"}
+// PATH: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0}
+// PATH: [[TYPE_i32]] = !{!"int", [[TYPE_char]],
+// PATH: [[TAG_i64]] = !{[[TYPE_i64:!.*]], [[TYPE_i64]], i64 0}
+// PATH: [[TYPE_i64]] = !{!"long long", [[TYPE_char]],
+// PATH: [[TAG_long]] = !{[[TYPE_long:!.*]], [[TYPE_long]], i64 0}
+// PATH: [[TYPE_long]] = !{!"long", [[TYPE_char]],
+// PATH: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0}
+// PATH: [[TYPE_i16]] = !{!"short", [[TYPE_char]],
+// PATH: [[TAG_i8]] = !{[[TYPE_i8:!.*]], [[TYPE_char]], i64 0}

>From 96864bfabaa1f0189a335cc0f5c8afc7ab9fd3f0 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 27 Nov 2023 14:03:34 +0000
Subject: [PATCH 2/5] Use correct integer type for enum

---
 clang/lib/CodeGen/CodeGenTBAA.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index f59d3d422d520..379097c1aeb1f 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -197,7 +197,7 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
     if (!Features.CPlusPlus)
-      return getTypeInfo(Context.getIntTypeForBitwidth(Size * 8, 0));
+      return getTypeInfo(ETy->getDecl()->getIntegerType());
 
     // In C++ mode, types have linkage, so we can rely on the ODR and
     // on their mangled names, if they're external.

>From d74f00cd4b5c482801e1a082e9ed16f564ef8d9e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 7 Dec 2023 09:15:24 +0000
Subject: [PATCH 3/5] Add enums to the TBAA documentation in LangRef.rst

---
 llvm/docs/LangRef.rst | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index bc1eab1e0b7a0..f7256b02a3209 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -6376,13 +6376,26 @@ aliases a memory access with an access tag ``(BaseTy2, AccessTy2,
 Offset2)`` if either ``(BaseTy1, Offset1)`` is reachable from ``(Base2,
 Offset2)`` via the ``Parent`` relation or vice versa.
 
+In C an enum will be compatible with an underlying integer type that is large
+enough to hold all enumerated values. In most cases this will be an ``int``,
+which is the default when no type is specified. However, if an ``int`` is not
+large enough to contain all enumerated values the compiler is free to choose
+any suitably larger compatible integer type. In addition, C23 allows you to
+explicitly specify the type, i.e. ``enum Foo : uint16_t { ... }``).
+
 As a concrete example, the type descriptor graph for the following program
 
 .. code-block:: c
 
+    enum Foo {
+      ENUM_1 = 1,
+      ENUM_2 = 2
+    };
+
     struct Inner {
-      int i;    // offset 0
-      float f;  // offset 4
+      int i;      // offset 0
+      float f;    // offset 4
+      enum Foo e; // offset 8
     };
 
     struct Outer {
@@ -6392,10 +6405,11 @@ As a concrete example, the type descriptor graph for the following program
     };
 
     void f(struct Outer* outer, struct Inner* inner, float* f, int* i, char* c) {
-      outer->f = 0;            // tag0: (OuterStructTy, FloatScalarTy, 0)
-      outer->inner_a.i = 0;    // tag1: (OuterStructTy, IntScalarTy, 12)
-      outer->inner_a.f = 0.0;  // tag2: (OuterStructTy, FloatScalarTy, 16)
-      *f = 0.0;                // tag3: (FloatScalarTy, FloatScalarTy, 0)
+      outer->f = 0;               // tag0: (OuterStructTy, FloatScalarTy, 0)
+      outer->inner_a.i = 0;       // tag1: (OuterStructTy, IntScalarTy, 12)
+      outer->inner_a.f = 0.0;     // tag2: (OuterStructTy, FloatScalarTy, 16)
+      outer->inner_a.e = ENUM_1;  // tag3: (OuterStructTy, IntScalarTy, 20)
+      *f = 0.0;                   // tag4: (FloatScalarTy, FloatScalarTy, 0)
     }
 
 is (note that in C and C++, ``char`` can be used to access any arbitrary
@@ -6408,7 +6422,7 @@ type):
     FloatScalarTy = ("float", CharScalarTy, 0)
     DoubleScalarTy = ("double", CharScalarTy, 0)
     IntScalarTy = ("int", CharScalarTy, 0)
-    InnerStructTy = {"Inner" (IntScalarTy, 0), (FloatScalarTy, 4)}
+    InnerStructTy = {"Inner" (IntScalarTy, 0), (FloatScalarTy, 4), (IntScalarTy, 12)}
     OuterStructTy = {"Outer", (FloatScalarTy, 0), (DoubleScalarTy, 4),
                      (InnerStructTy, 12)}
 

>From 3fd936071140e8e3bcd4a5c78f08351c0376f513 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 7 Dec 2023 09:52:00 +0000
Subject: [PATCH 4/5] Add enum change to ReleaseNotes.rst

---
 clang/docs/ReleaseNotes.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ed1a978b5382d..2636b50d0ad70 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -196,6 +196,9 @@ C Language Changes
   number of elements in the flexible array member. This information can improve
   the results of the array bound sanitizer and the
   ``__builtin_dynamic_object_size`` builtin.
+- Enums will now be represented in TBAA metadata using their actual underlying
+  integer type. Previously they were treated as chars, which meant they could
+  alias with all other types.
 
 C23 Feature Support
 ^^^^^^^^^^^^^^^^^^^

>From 546043d4155ffe4b6fe47a9f4074070224cd23f1 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 7 Dec 2023 14:15:59 +0000
Subject: [PATCH 5/5] Revert changes to LangRef.rst

---
 llvm/docs/LangRef.rst | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index f7256b02a3209..bc1eab1e0b7a0 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -6376,26 +6376,13 @@ aliases a memory access with an access tag ``(BaseTy2, AccessTy2,
 Offset2)`` if either ``(BaseTy1, Offset1)`` is reachable from ``(Base2,
 Offset2)`` via the ``Parent`` relation or vice versa.
 
-In C an enum will be compatible with an underlying integer type that is large
-enough to hold all enumerated values. In most cases this will be an ``int``,
-which is the default when no type is specified. However, if an ``int`` is not
-large enough to contain all enumerated values the compiler is free to choose
-any suitably larger compatible integer type. In addition, C23 allows you to
-explicitly specify the type, i.e. ``enum Foo : uint16_t { ... }``).
-
 As a concrete example, the type descriptor graph for the following program
 
 .. code-block:: c
 
-    enum Foo {
-      ENUM_1 = 1,
-      ENUM_2 = 2
-    };
-
     struct Inner {
-      int i;      // offset 0
-      float f;    // offset 4
-      enum Foo e; // offset 8
+      int i;    // offset 0
+      float f;  // offset 4
     };
 
     struct Outer {
@@ -6405,11 +6392,10 @@ As a concrete example, the type descriptor graph for the following program
     };
 
     void f(struct Outer* outer, struct Inner* inner, float* f, int* i, char* c) {
-      outer->f = 0;               // tag0: (OuterStructTy, FloatScalarTy, 0)
-      outer->inner_a.i = 0;       // tag1: (OuterStructTy, IntScalarTy, 12)
-      outer->inner_a.f = 0.0;     // tag2: (OuterStructTy, FloatScalarTy, 16)
-      outer->inner_a.e = ENUM_1;  // tag3: (OuterStructTy, IntScalarTy, 20)
-      *f = 0.0;                   // tag4: (FloatScalarTy, FloatScalarTy, 0)
+      outer->f = 0;            // tag0: (OuterStructTy, FloatScalarTy, 0)
+      outer->inner_a.i = 0;    // tag1: (OuterStructTy, IntScalarTy, 12)
+      outer->inner_a.f = 0.0;  // tag2: (OuterStructTy, FloatScalarTy, 16)
+      *f = 0.0;                // tag3: (FloatScalarTy, FloatScalarTy, 0)
     }
 
 is (note that in C and C++, ``char`` can be used to access any arbitrary
@@ -6422,7 +6408,7 @@ type):
     FloatScalarTy = ("float", CharScalarTy, 0)
     DoubleScalarTy = ("double", CharScalarTy, 0)
     IntScalarTy = ("int", CharScalarTy, 0)
-    InnerStructTy = {"Inner" (IntScalarTy, 0), (FloatScalarTy, 4), (IntScalarTy, 12)}
+    InnerStructTy = {"Inner" (IntScalarTy, 0), (FloatScalarTy, 4)}
     OuterStructTy = {"Outer", (FloatScalarTy, 0), (DoubleScalarTy, 4),
                      (InnerStructTy, 12)}
 



More information about the llvm-commits mailing list