[clang] [llvm] Add metadata for const C/C++ scalar types to track initial values of escaped alloca (PR #157676)

Vladislav Belov via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 9 06:59:10 PDT 2025


https://github.com/vbe-sc created https://github.com/llvm/llvm-project/pull/157676

According to the 9.2.9.2.4 bullet of the C++ standard:
> Any attempt to modify (7.6.19, 7.6.1.6, 7.6.2.3) a const object (6.8.5) during its lifetime (6.7.3) results in
undefined behavior.

and 6.7.4.7 bullet of the C standard: 
> If an attempt is made to modify an object defined with a const-qualified type through use of an
lvalue with non-const-qualified type, the behavior is undefined.

the following case: 

```
void foo(int const * const p);

int test(int p1) {
    const int p2 = p1; 
    foo(&p2);
    if (p1 == p2)
        return 14;
    return 42;
}
```

can be optimized with dropping `p1 == p2` comparison. However, LLVM doesn't emit any information about `const` qualifier. 

This patch enables such optimizations

>From 8999ba88d20054e329736d43bf57ba19ee07974a Mon Sep 17 00:00:00 2001
From: vb-sc <vladislav.belov at syntacore.com>
Date: Tue, 9 Sep 2025 16:55:07 +0300
Subject: [PATCH 1/2] [clang] Add precommit tests for immutable alloca related
 to const scalar C/C++ types

---
 clang/test/CodeGen/const-alloca.c             | 14 ++++
 .../SROA/sroa-immutable-alloca-propagation.ll | 66 +++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 clang/test/CodeGen/const-alloca.c
 create mode 100644 llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll

diff --git a/clang/test/CodeGen/const-alloca.c b/clang/test/CodeGen/const-alloca.c
new file mode 100644
index 0000000000000..c68bdadad8fc4
--- /dev/null
+++ b/clang/test/CodeGen/const-alloca.c
@@ -0,0 +1,14 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+
+// CHECK-LABEL: define dso_local i32 @test(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i32 1, ptr [[X]], align 4
+// CHECK-NEXT:    ret i32 1
+//
+int test() {
+  const int x = 1;
+  return x;
+}
diff --git a/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
new file mode 100644
index 0000000000000..6dd79832b6d38
--- /dev/null
+++ b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=sroa < %s | FileCheck %s
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test2(i16 noundef signext %p1) {
+; CHECK-LABEL: define dso_local i32 @test2(
+; CHECK-SAME: i16 noundef signext [[P1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[P2:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[P2]])
+; CHECK-NEXT:    store i16 [[P1]], ptr [[P2]], align 2
+; CHECK-NEXT:    call void @foo(ptr noundef [[P2]])
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[P1]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P2]], align 2
+; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CONV]], [[CONV1]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    br label %[[CLEANUP:.*]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    br label %[[CLEANUP]]
+; CHECK:       [[CLEANUP]]:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 14, %[[IF_THEN]] ], [ 42, %[[IF_END]] ]
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr [[P2]])
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+;
+entry:
+  %retval = alloca i32, align 4
+  %p1.addr = alloca i16, align 2
+  %p2 = alloca i16, align 2
+  %cleanup.dest.slot = alloca i32, align 4
+  store i16 %p1, ptr %p1.addr, align 2
+  call void @llvm.lifetime.start.p0(ptr %p2)
+  %0 = load i16, ptr %p1.addr, align 2
+  store i16 %0, ptr %p2, align 2
+  call void @foo(ptr noundef %p2)
+  %1 = load i16, ptr %p1.addr, align 2
+  %conv = sext i16 %1 to i32
+  %2 = load i16, ptr %p2, align 2
+  %conv1 = sext i16 %2 to i32
+  %cmp = icmp eq i32 %conv, %conv1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 14, ptr %retval, align 4
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+if.end:                                           ; preds = %entry
+  store i32 42, ptr %retval, align 4
+  store i32 1, ptr %cleanup.dest.slot, align 4
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.end, %if.then
+  call void @llvm.lifetime.end.p0(ptr %p2)
+  %3 = load i32, ptr %retval, align 4
+  ret i32 %3
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(ptr captures(none))
+
+declare void @foo(ptr noundef)
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(ptr captures(none))

>From d8fa50fe4d50284e6d5ae3287861e7fdaf621187 Mon Sep 17 00:00:00 2001
From: vb-sc <vladislav.belov at syntacore.com>
Date: Tue, 9 Sep 2025 16:56:32 +0300
Subject: [PATCH 2/2] Add metadata for const C/C++ scalar types to track
 initial values of escaped alloca

---
 clang/lib/CodeGen/CGDecl.cpp                  | 15 ++++-
 clang/lib/CodeGen/CGExpr.cpp                  | 20 +++++--
 clang/lib/CodeGen/CodeGenFunction.h           |  6 +-
 clang/test/CodeGen/const-alloca.c             |  5 +-
 .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl    | 58 +++++++++----------
 llvm/include/llvm/IR/FixedMetadataKinds.def   |  1 +
 llvm/lib/Transforms/Scalar/SROA.cpp           | 48 ++++++++++++++-
 .../SROA/sroa-immutable-alloca-propagation.ll | 11 +++-
 8 files changed, 119 insertions(+), 45 deletions(-)

diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 29193e0c541b9..a96c796b18ddf 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1450,6 +1450,14 @@ static uint64_t maxFakeUseAggregateSize(const ASTContext &C) {
   return 4 * C.getTypeSize(C.UnsignedIntTy);
 }
 
+static bool checkIsReadOnlyMetadataAvailable(QualType Ty,
+                                             const LangOptions &LO) {
+  bool IsLangSupported =
+      LO.C99 || LO.C11 || LO.C17 || LO.C23 || LO.C2y || LO.CPlusPlus;
+  // Currently support only for scalar types
+  return IsLangSupported && Ty.isConstQualified() && Ty->isScalarType();
+}
+
 // Helper function to determine whether a variable's or parameter's lifetime
 // should be extended.
 static bool shouldExtendLifetime(const ASTContext &Context,
@@ -1601,9 +1609,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
       // Create the alloca.  Note that we set the name separately from
       // building the instruction so that it's there even in no-asserts
       // builds.
-      address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
-                                 allocaAlignment, D.getName(),
-                                 /*ArraySize=*/nullptr, &AllocaAddr);
+      address = CreateTempAlloca(
+          allocaTy, Ty.getAddressSpace(), allocaAlignment, D.getName(),
+          /*ArraySize=*/nullptr, &AllocaAddr,
+          checkIsReadOnlyMetadataAvailable(Ty, getLangOpts()));
 
       // Don't emit lifetime markers for MSVC catch parameters. The lifetime of
       // the catch parameter starts in the catchpad instruction, and we can't
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index e8456a44f8367..a2351d30af394 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -99,11 +99,17 @@ static llvm::StringRef GetUBSanTrapForHandler(SanitizerHandler ID) {
 
 /// CreateTempAlloca - This creates a alloca and inserts it into the entry
 /// block.
-RawAddress
-CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits Align,
-                                             const Twine &Name,
-                                             llvm::Value *ArraySize) {
+RawAddress CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty,
+                                                        CharUnits Align,
+                                                        const Twine &Name,
+                                                        llvm::Value *ArraySize,
+                                                        bool IsReadOnly) {
   auto Alloca = CreateTempAlloca(Ty, Name, ArraySize);
+  if (IsReadOnly) {
+    llvm::MDNode *Node = llvm::MDNode::get(
+        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+    Alloca->setMetadata(llvm::LLVMContext::MD_immutable, Node);
+  }
   Alloca->setAlignment(Align.getAsAlign());
   return RawAddress(Alloca, Ty, Align, KnownNonNull);
 }
@@ -138,8 +144,10 @@ RawAddress CodeGenFunction::MaybeCastStackAddressSpace(RawAddress Alloca,
 RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS,
                                              CharUnits Align, const Twine &Name,
                                              llvm::Value *ArraySize,
-                                             RawAddress *AllocaAddr) {
-  RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize);
+                                             RawAddress *AllocaAddr,
+                                             bool IsReadOnly) {
+  RawAddress Alloca =
+      CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize, IsReadOnly);
   if (AllocaAddr)
     *AllocaAddr = Alloca;
   return MaybeCastStackAddressSpace(Alloca, DestLangAS, ArraySize);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 123cb4f51f828..c64312ba8e52a 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -2848,7 +2848,8 @@ class CodeGenFunction : public CodeGenTypeCache {
   RawAddress CreateTempAlloca(llvm::Type *Ty, LangAS UseAddrSpace,
                               CharUnits align, const Twine &Name = "tmp",
                               llvm::Value *ArraySize = nullptr,
-                              RawAddress *Alloca = nullptr);
+                              RawAddress *Alloca = nullptr,
+                              bool IsReadOnly = false);
 
   /// CreateTempAlloca - This creates a alloca and inserts it into the entry
   /// block. The alloca is casted to default address space if necessary.
@@ -2865,7 +2866,8 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   RawAddress CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits align,
                                          const Twine &Name = "tmp",
-                                         llvm::Value *ArraySize = nullptr);
+                                         llvm::Value *ArraySize = nullptr,
+                                         bool IsReadOnly = false);
 
   /// CreateDefaultAlignedTempAlloca - This creates an alloca with the
   /// default ABI alignment of the given LLVM type.
diff --git a/clang/test/CodeGen/const-alloca.c b/clang/test/CodeGen/const-alloca.c
index c68bdadad8fc4..96b89fc7f8aff 100644
--- a/clang/test/CodeGen/const-alloca.c
+++ b/clang/test/CodeGen/const-alloca.c
@@ -4,7 +4,7 @@
 // CHECK-LABEL: define dso_local i32 @test(
 // CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[X:%.*]] = alloca i32, align 4, !immutable [[META2:![0-9]+]]
 // CHECK-NEXT:    store i32 1, ptr [[X]], align 4
 // CHECK-NEXT:    ret i32 1
 //
@@ -12,3 +12,6 @@ int test() {
   const int x = 1;
   return x;
 }
+//.
+// CHECK: [[META2]] = !{i32 1}
+//.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index d71c89811f04b..9dead762a8bd4 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -129,7 +129,7 @@ kernel void test_target_features_kernel(global int *i) {
 // NOCPU-NEXT:    [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
 // NOCPU-NEXT:    [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)
-// NOCPU-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5), !immutable [[META7:![0-9]+]]
 // NOCPU-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
 // NOCPU-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
 // NOCPU-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
@@ -235,7 +235,7 @@ kernel void test_target_features_kernel(global int *i) {
 //
 // NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
 // NOCPU-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel(
-// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] {
+// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] {
 // NOCPU-NEXT:  [[ENTRY:.*:]]
 // NOCPU-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // NOCPU-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
@@ -503,7 +503,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
 // GFX900-NEXT:    [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
 // GFX900-NEXT:    [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)
-// GFX900-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
+// GFX900-NEXT:    [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5), !immutable [[META17:![0-9]+]]
 // GFX900-NEXT:    [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
 // GFX900-NEXT:    [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
 // GFX900-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
@@ -525,11 +525,11 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]]
-// GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17:![0-9]+]]
+// GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18:![0-9]+]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]]
-// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19:![0-9]+]]
-// GFX900-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]]
+// GFX900-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20:![0-9]+]]
+// GFX900-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22:![0-9]+]]
 // GFX900-NEXT:    [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0
 // GFX900-NEXT:    store i32 25, ptr [[BLOCK_SIZE]], align 8
 // GFX900-NEXT:    [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 1
@@ -543,9 +543,9 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]]
 // GFX900-NEXT:    store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]]
 // GFX900-NEXT:    [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]])
-// GFX900-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]]
+// GFX900-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]]
 // GFX900-NEXT:    [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0
 // GFX900-NEXT:    store i32 41, ptr [[BLOCK_SIZE4]], align 8
 // GFX900-NEXT:    [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 1
@@ -565,9 +565,9 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
 // GFX900-NEXT:    store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
 // GFX900-NEXT:    [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]])
-// GFX900-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]]
+// GFX900-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]]
 // GFX900-NEXT:    [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0
 // GFX900-NEXT:    store i32 41, ptr [[BLOCK_SIZE13]], align 8
 // GFX900-NEXT:    [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 1
@@ -605,9 +605,9 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]]
 // GFX900-NEXT:    store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, !tbaa [[TBAA7]]
 // GFX900-NEXT:    store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]]
-// GFX900-NEXT:    [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT:    [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]]
+// GFX900-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]]
 // GFX900-NEXT:    [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]]
 // GFX900-NEXT:    [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]])
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]]
@@ -619,7 +619,7 @@ kernel void test_target_features_kernel(global int *i) {
 //
 // GFX900: Function Attrs: convergent norecurse nounwind
 // GFX900-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel(
-// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
+// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META17]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // GFX900-NEXT:    [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
@@ -631,7 +631,7 @@ kernel void test_target_features_kernel(global int *i) {
 //
 // GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
 // GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel(
-// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META17]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] {
 // GFX900-NEXT:  [[ENTRY:.*:]]
 // GFX900-NEXT:    [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // GFX900-NEXT:    [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -643,12 +643,12 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]]
-// GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT:    store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
 // GFX900-NEXT:    call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]]
 // GFX900-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
-// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]]
+// GFX900-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT:    call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]]
 // GFX900-NEXT:    [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]]
 // GFX900-NEXT:    call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]]
@@ -740,7 +740,7 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]]
 // GFX900-NEXT:    [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32]]
 // GFX900-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
-// GFX900-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT:    store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA18]]
 // GFX900-NEXT:    ret void
 //
 //
@@ -866,12 +866,12 @@ kernel void test_target_features_kernel(global int *i) {
 // GFX900: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0}
 // GFX900: [[META15]] = !{!"p1 omnipotent char", [[META9]], i64 0}
 // GFX900: [[TBAA16]] = !{[[META5]], [[META5]], i64 0}
-// GFX900: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0}
-// GFX900: [[META18]] = !{!"int", [[META5]], i64 0}
-// GFX900: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0}
-// GFX900: [[META20]] = !{!"queue_t", [[META5]], i64 0}
-// GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[TBAA17]]}
-// GFX900: [[META22]] = !{i32 1}
+// GFX900: [[META17]] = !{i32 1}
+// GFX900: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+// GFX900: [[META19]] = !{!"int", [[META5]], i64 0}
+// GFX900: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
+// GFX900: [[META21]] = !{!"queue_t", [[META5]], i64 0}
+// GFX900: [[TBAA_STRUCT22]] = !{i64 0, i64 4, [[TBAA18]]}
 // GFX900: [[META23]] = !{!"none"}
 // GFX900: [[META24]] = !{!"int*"}
 // GFX900: [[META25]] = !{!""}
diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def
index d09cc15d65ff6..a7736166b8a44 100644
--- a/llvm/include/llvm/IR/FixedMetadataKinds.def
+++ b/llvm/include/llvm/IR/FixedMetadataKinds.def
@@ -55,3 +55,4 @@ LLVM_FIXED_MD_KIND(MD_mmra, "mmra", 40)
 LLVM_FIXED_MD_KIND(MD_noalias_addrspace, "noalias.addrspace", 41)
 LLVM_FIXED_MD_KIND(MD_callee_type, "callee_type", 42)
 LLVM_FIXED_MD_KIND(MD_nofree, "nofree", 43)
+LLVM_FIXED_MD_KIND(MD_immutable, "immutable", 44)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 9b4f1dc6ddb34..f33d477fcbfee 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -102,6 +102,7 @@ using namespace llvm;
 #define DEBUG_TYPE "sroa"
 
 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumConstAllocasPropagated, "Number of immutable allocas propageted");
 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
@@ -248,6 +249,7 @@ class SROA {
   bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
   AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
   bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
+  bool tryToPropagateImmutableAllocaInitValue(AllocaInst &AI, AllocaSlices &AS);
   bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
   std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
   void clobberUse(Use &U);
@@ -5452,6 +5454,47 @@ class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
   Type *ZeroType;
 };
 
+bool SROA::tryToPropagateImmutableAllocaInitValue(AllocaInst &AI,
+                                                  AllocaSlices &AS) {
+  // If an alloca of a scalar type is marked with the immutable metadata, it
+  // means that it cannot be reinitialized. Therefore, we can propagate its
+  // initial value quite early throughout, even if the alloca is escaped.
+  bool Changed = false;
+  SmallVector<User *> AIStoreUsers;
+
+  copy_if(AI.users(), std::back_inserter(AIStoreUsers), [&AI](auto *U) {
+    auto *SI = dyn_cast<StoreInst>(U);
+    return (SI && SI->getPointerOperand() == &AI);
+  });
+
+  if (range_size(AIStoreUsers) != 1)
+    return Changed;
+
+  SmallVector<User *> AILoadUsers;
+
+  copy_if(AI.users(), std::back_inserter(AILoadUsers), [&AI](auto *U) {
+    auto *LI = dyn_cast<LoadInst>(U);
+    return (LI && LI->getPointerOperand() == &AI);
+  });
+
+  auto *StoreInitValInst = dyn_cast<StoreInst>(AIStoreUsers.front());
+
+  assert(StoreInitValInst);
+  auto *InitVal = StoreInitValInst->getValueOperand();
+
+  for (User *U : AILoadUsers) {
+    auto *LI = dyn_cast<LoadInst>(U);
+    assert(LI);
+    assert(DTU->getDomTree().dominates(InitVal, LI));
+    assert(InitVal->getType() == LI->getType());
+    ++NumConstAllocasPropagated;
+    LI->replaceAllUsesWith(InitVal);
+    Changed |= true;
+  }
+
+  return Changed;
+}
+
 bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
   // Look through each "partition", looking for slices with the same start/end
   // that do not overlap with any before them. The slices are sorted by
@@ -5564,8 +5607,11 @@ SROA::runOnAlloca(AllocaInst &AI) {
   // Build the slices using a recursive instruction-visiting builder.
   AllocaSlices AS(DL, AI);
   LLVM_DEBUG(AS.print(dbgs()));
-  if (AS.isEscaped())
+  if (AS.isEscaped()) {
+    if (AI.hasMetadata(LLVMContext::MD_immutable))
+      Changed |= tryToPropagateImmutableAllocaInitValue(AI, AS);
     return {Changed, CFGChanged};
+  }
 
   if (AS.isEscapedReadOnly()) {
     Changed |= propagateStoredValuesToLoads(AI, AS);
diff --git a/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
index 6dd79832b6d38..ada0a1107978a 100644
--- a/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
+++ b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
@@ -6,13 +6,13 @@ define dso_local i32 @test2(i16 noundef signext %p1) {
 ; CHECK-LABEL: define dso_local i32 @test2(
 ; CHECK-SAME: i16 noundef signext [[P1:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[P2:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    [[P2:%.*]] = alloca i16, align 2, !immutable [[META0:![0-9]+]]
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr [[P2]])
 ; CHECK-NEXT:    store i16 [[P1]], ptr [[P2]], align 2
 ; CHECK-NEXT:    call void @foo(ptr noundef [[P2]])
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[P1]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P2]], align 2
-; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[P1]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CONV]], [[CONV1]]
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN]]:
@@ -27,7 +27,7 @@ define dso_local i32 @test2(i16 noundef signext %p1) {
 entry:
   %retval = alloca i32, align 4
   %p1.addr = alloca i16, align 2
-  %p2 = alloca i16, align 2
+  %p2 = alloca i16, align 2, !immutable !1
   %cleanup.dest.slot = alloca i32, align 4
   store i16 %p1, ptr %p1.addr, align 2
   call void @llvm.lifetime.start.p0(ptr %p2)
@@ -64,3 +64,8 @@ declare void @foo(ptr noundef)
 
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
 declare void @llvm.lifetime.end.p0(ptr captures(none))
+
+!1 = !{i32 1}
+;.
+; CHECK: [[META0]] = !{i32 1}
+;.



More information about the llvm-commits mailing list