[clang] [llvm] Add metadata for const C/C++ scalar types to track initial values of escaped alloca (PR #157676)
Vladislav Belov via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 9 06:59:10 PDT 2025
https://github.com/vbe-sc created https://github.com/llvm/llvm-project/pull/157676
According to bullet 9.2.9.2.4 of the C++ standard:

> Any attempt to modify (7.6.19, 7.6.1.6, 7.6.2.3) a const object (6.8.5) during its lifetime (6.7.3) results in
> undefined behavior.

and bullet 6.7.4.7 of the C standard:

> If an attempt is made to modify an object defined with a const-qualified type through use of an
> lvalue with non-const-qualified type, the behavior is undefined.

the following case:
```
void foo(int const * const p);
int test(int p1) {
const int p2 = p1;
foo(&p2);
if (p1 == p2)
return 14;
return 42;
}
```
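can be optimized by dropping the `p1 == p2` comparison. However, the generated LLVM IR doesn't carry any information about the `const` qualifier.
This patch enables such optimizations: Clang attaches `!immutable` metadata to the allocas of const-qualified scalar locals, and SROA uses that metadata to propagate the initially stored value to loads of the alloca, even when the alloca has escaped.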
>From 8999ba88d20054e329736d43bf57ba19ee07974a Mon Sep 17 00:00:00 2001
From: vb-sc <vladislav.belov at syntacore.com>
Date: Tue, 9 Sep 2025 16:55:07 +0300
Subject: [PATCH 1/2] [clang] Add precommit tests for immutable alloca related
to const scalar C/C++ types
---
clang/test/CodeGen/const-alloca.c | 14 ++++
.../SROA/sroa-immutable-alloca-propagation.ll | 66 +++++++++++++++++++
2 files changed, 80 insertions(+)
create mode 100644 clang/test/CodeGen/const-alloca.c
create mode 100644 llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
diff --git a/clang/test/CodeGen/const-alloca.c b/clang/test/CodeGen/const-alloca.c
new file mode 100644
index 0000000000000..c68bdadad8fc4
--- /dev/null
+++ b/clang/test/CodeGen/const-alloca.c
@@ -0,0 +1,14 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+
+// CHECK-LABEL: define dso_local i32 @test(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[X:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 1, ptr [[X]], align 4
+// CHECK-NEXT: ret i32 1
+//
+int test() {
+ const int x = 1;
+ return x;
+}
diff --git a/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
new file mode 100644
index 0000000000000..6dd79832b6d38
--- /dev/null
+++ b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=sroa < %s | FileCheck %s
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test2(i16 noundef signext %p1) {
+; CHECK-LABEL: define dso_local i32 @test2(
+; CHECK-SAME: i16 noundef signext [[P1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[P2:%.*]] = alloca i16, align 2
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[P2]])
+; CHECK-NEXT: store i16 [[P1]], ptr [[P2]], align 2
+; CHECK-NEXT: call void @foo(ptr noundef [[P2]])
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[P1]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P2]], align 2
+; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CONV]], [[CONV1]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: br label %[[CLEANUP:.*]]
+; CHECK: [[IF_END]]:
+; CHECK-NEXT: br label %[[CLEANUP]]
+; CHECK: [[CLEANUP]]:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 14, %[[IF_THEN]] ], [ 42, %[[IF_END]] ]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[P2]])
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+entry:
+ %retval = alloca i32, align 4
+ %p1.addr = alloca i16, align 2
+ %p2 = alloca i16, align 2
+ %cleanup.dest.slot = alloca i32, align 4
+ store i16 %p1, ptr %p1.addr, align 2
+ call void @llvm.lifetime.start.p0(ptr %p2)
+ %0 = load i16, ptr %p1.addr, align 2
+ store i16 %0, ptr %p2, align 2
+ call void @foo(ptr noundef %p2)
+ %1 = load i16, ptr %p1.addr, align 2
+ %conv = sext i16 %1 to i32
+ %2 = load i16, ptr %p2, align 2
+ %conv1 = sext i16 %2 to i32
+ %cmp = icmp eq i32 %conv, %conv1
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ store i32 14, ptr %retval, align 4
+ store i32 1, ptr %cleanup.dest.slot, align 4
+ br label %cleanup
+
+if.end: ; preds = %entry
+ store i32 42, ptr %retval, align 4
+ store i32 1, ptr %cleanup.dest.slot, align 4
+ br label %cleanup
+
+cleanup: ; preds = %if.end, %if.then
+ call void @llvm.lifetime.end.p0(ptr %p2)
+ %3 = load i32, ptr %retval, align 4
+ ret i32 %3
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(ptr captures(none))
+
+declare void @foo(ptr noundef)
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(ptr captures(none))
>From d8fa50fe4d50284e6d5ae3287861e7fdaf621187 Mon Sep 17 00:00:00 2001
From: vb-sc <vladislav.belov at syntacore.com>
Date: Tue, 9 Sep 2025 16:56:32 +0300
Subject: [PATCH 2/2] Add metadata for const C/C++ scalar types to track
initial values of escaped alloca
---
clang/lib/CodeGen/CGDecl.cpp | 15 ++++-
clang/lib/CodeGen/CGExpr.cpp | 20 +++++--
clang/lib/CodeGen/CodeGenFunction.h | 6 +-
clang/test/CodeGen/const-alloca.c | 5 +-
.../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 58 +++++++++----------
llvm/include/llvm/IR/FixedMetadataKinds.def | 1 +
llvm/lib/Transforms/Scalar/SROA.cpp | 48 ++++++++++++++-
.../SROA/sroa-immutable-alloca-propagation.ll | 11 +++-
8 files changed, 119 insertions(+), 45 deletions(-)
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 29193e0c541b9..a96c796b18ddf 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1450,6 +1450,14 @@ static uint64_t maxFakeUseAggregateSize(const ASTContext &C) {
return 4 * C.getTypeSize(C.UnsignedIntTy);
}
+static bool checkIsReadOnlyMetadataAvailable(QualType Ty,
+ const LangOptions &LO) {
+ bool IsLangSupported =
+ LO.C99 || LO.C11 || LO.C17 || LO.C23 || LO.C2y || LO.CPlusPlus;
+ // Currently support only for scalar types
+ return IsLangSupported && Ty.isConstQualified() && Ty->isScalarType();
+}
+
// Helper function to determine whether a variable's or parameter's lifetime
// should be extended.
static bool shouldExtendLifetime(const ASTContext &Context,
@@ -1601,9 +1609,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
// Create the alloca. Note that we set the name separately from
// building the instruction so that it's there even in no-asserts
// builds.
- address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
- allocaAlignment, D.getName(),
- /*ArraySize=*/nullptr, &AllocaAddr);
+ address = CreateTempAlloca(
+ allocaTy, Ty.getAddressSpace(), allocaAlignment, D.getName(),
+ /*ArraySize=*/nullptr, &AllocaAddr,
+ checkIsReadOnlyMetadataAvailable(Ty, getLangOpts()));
// Don't emit lifetime markers for MSVC catch parameters. The lifetime of
// the catch parameter starts in the catchpad instruction, and we can't
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index e8456a44f8367..a2351d30af394 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -99,11 +99,17 @@ static llvm::StringRef GetUBSanTrapForHandler(SanitizerHandler ID) {
/// CreateTempAlloca - This creates a alloca and inserts it into the entry
/// block.
-RawAddress
-CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits Align,
- const Twine &Name,
- llvm::Value *ArraySize) {
+RawAddress CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty,
+ CharUnits Align,
+ const Twine &Name,
+ llvm::Value *ArraySize,
+ bool IsReadOnly) {
auto Alloca = CreateTempAlloca(Ty, Name, ArraySize);
+ if (IsReadOnly) {
+ llvm::MDNode *Node = llvm::MDNode::get(
+ getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
+ Alloca->setMetadata(llvm::LLVMContext::MD_immutable, Node);
+ }
Alloca->setAlignment(Align.getAsAlign());
return RawAddress(Alloca, Ty, Align, KnownNonNull);
}
@@ -138,8 +144,10 @@ RawAddress CodeGenFunction::MaybeCastStackAddressSpace(RawAddress Alloca,
RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS,
CharUnits Align, const Twine &Name,
llvm::Value *ArraySize,
- RawAddress *AllocaAddr) {
- RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize);
+ RawAddress *AllocaAddr,
+ bool IsReadOnly) {
+ RawAddress Alloca =
+ CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize, IsReadOnly);
if (AllocaAddr)
*AllocaAddr = Alloca;
return MaybeCastStackAddressSpace(Alloca, DestLangAS, ArraySize);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 123cb4f51f828..c64312ba8e52a 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -2848,7 +2848,8 @@ class CodeGenFunction : public CodeGenTypeCache {
RawAddress CreateTempAlloca(llvm::Type *Ty, LangAS UseAddrSpace,
CharUnits align, const Twine &Name = "tmp",
llvm::Value *ArraySize = nullptr,
- RawAddress *Alloca = nullptr);
+ RawAddress *Alloca = nullptr,
+ bool IsReadOnly = false);
/// CreateTempAlloca - This creates a alloca and inserts it into the entry
/// block. The alloca is casted to default address space if necessary.
@@ -2865,7 +2866,8 @@ class CodeGenFunction : public CodeGenTypeCache {
RawAddress CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits align,
const Twine &Name = "tmp",
- llvm::Value *ArraySize = nullptr);
+ llvm::Value *ArraySize = nullptr,
+ bool IsReadOnly = false);
/// CreateDefaultAlignedTempAlloca - This creates an alloca with the
/// default ABI alignment of the given LLVM type.
diff --git a/clang/test/CodeGen/const-alloca.c b/clang/test/CodeGen/const-alloca.c
index c68bdadad8fc4..96b89fc7f8aff 100644
--- a/clang/test/CodeGen/const-alloca.c
+++ b/clang/test/CodeGen/const-alloca.c
@@ -4,7 +4,7 @@
// CHECK-LABEL: define dso_local i32 @test(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[X:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[X:%.*]] = alloca i32, align 4, !immutable [[META2:![0-9]+]]
// CHECK-NEXT: store i32 1, ptr [[X]], align 4
// CHECK-NEXT: ret i32 1
//
@@ -12,3 +12,6 @@ int test() {
const int x = 1;
return x;
}
+//.
+// CHECK: [[META2]] = !{i32 1}
+//.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
index d71c89811f04b..9dead762a8bd4 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -129,7 +129,7 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU-NEXT: [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
// NOCPU-NEXT: [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// NOCPU-NEXT: [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)
-// NOCPU-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
+// NOCPU-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5), !immutable [[META7:![0-9]+]]
// NOCPU-NEXT: [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
// NOCPU-NEXT: [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
// NOCPU-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
@@ -235,7 +235,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
// NOCPU-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel(
-// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] {
+// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] {
// NOCPU-NEXT: [[ENTRY:.*:]]
// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
@@ -503,7 +503,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
// GFX900-NEXT: [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// GFX900-NEXT: [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)
-// GFX900-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
+// GFX900-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5), !immutable [[META17:![0-9]+]]
// GFX900-NEXT: [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
// GFX900-NEXT: [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
// GFX900-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
@@ -525,11 +525,11 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]]
-// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17:![0-9]+]]
+// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18:![0-9]+]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]]
-// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19:![0-9]+]]
-// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]]
+// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20:![0-9]+]]
+// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22:![0-9]+]]
// GFX900-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0
// GFX900-NEXT: store i32 25, ptr [[BLOCK_SIZE]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 1
@@ -543,9 +543,9 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]]
// GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]])
-// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]]
+// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]]
// GFX900-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0
// GFX900-NEXT: store i32 41, ptr [[BLOCK_SIZE4]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 1
@@ -565,9 +565,9 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]])
-// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]]
+// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]]
// GFX900-NEXT: [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0
// GFX900-NEXT: store i32 41, ptr [[BLOCK_SIZE13]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 1
@@ -605,9 +605,9 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]]
-// GFX900-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]]
+// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]]
// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]])
// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]]
@@ -619,7 +619,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: convergent norecurse nounwind
// GFX900-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel(
-// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
+// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META17]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
@@ -631,7 +631,7 @@ kernel void test_target_features_kernel(global int *i) {
//
// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel(
-// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] {
+// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META17]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] {
// GFX900-NEXT: [[ENTRY:.*:]]
// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -643,12 +643,12 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]]
-// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]]
// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
-// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]]
-// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]]
-// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]]
+// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]]
+// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]]
+// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]]
// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]]
@@ -740,7 +740,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32]]
// GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
-// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]]
+// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA18]]
// GFX900-NEXT: ret void
//
//
@@ -866,12 +866,12 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0}
// GFX900: [[META15]] = !{!"p1 omnipotent char", [[META9]], i64 0}
// GFX900: [[TBAA16]] = !{[[META5]], [[META5]], i64 0}
-// GFX900: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0}
-// GFX900: [[META18]] = !{!"int", [[META5]], i64 0}
-// GFX900: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0}
-// GFX900: [[META20]] = !{!"queue_t", [[META5]], i64 0}
-// GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[TBAA17]]}
-// GFX900: [[META22]] = !{i32 1}
+// GFX900: [[META17]] = !{i32 1}
+// GFX900: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+// GFX900: [[META19]] = !{!"int", [[META5]], i64 0}
+// GFX900: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
+// GFX900: [[META21]] = !{!"queue_t", [[META5]], i64 0}
+// GFX900: [[TBAA_STRUCT22]] = !{i64 0, i64 4, [[TBAA18]]}
// GFX900: [[META23]] = !{!"none"}
// GFX900: [[META24]] = !{!"int*"}
// GFX900: [[META25]] = !{!""}
diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def
index d09cc15d65ff6..a7736166b8a44 100644
--- a/llvm/include/llvm/IR/FixedMetadataKinds.def
+++ b/llvm/include/llvm/IR/FixedMetadataKinds.def
@@ -55,3 +55,4 @@ LLVM_FIXED_MD_KIND(MD_mmra, "mmra", 40)
LLVM_FIXED_MD_KIND(MD_noalias_addrspace, "noalias.addrspace", 41)
LLVM_FIXED_MD_KIND(MD_callee_type, "callee_type", 42)
LLVM_FIXED_MD_KIND(MD_nofree, "nofree", 43)
+LLVM_FIXED_MD_KIND(MD_immutable, "immutable", 44)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 9b4f1dc6ddb34..f33d477fcbfee 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -102,6 +102,7 @@ using namespace llvm;
#define DEBUG_TYPE "sroa"
STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumConstAllocasPropagated, "Number of immutable allocas propageted");
STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
@@ -248,6 +249,7 @@ class SROA {
bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
+ bool tryToPropagateImmutableAllocaInitValue(AllocaInst &AI, AllocaSlices &AS);
bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
void clobberUse(Use &U);
@@ -5452,6 +5454,47 @@ class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
Type *ZeroType;
};
+bool SROA::tryToPropagateImmutableAllocaInitValue(AllocaInst &AI,
+ AllocaSlices &AS) {
+ // If an alloca of a scalar type is marked with the immutable metadata, it
+ // means that it cannot be reinitialized. Therefore, we can propagate its
+ // initial value quite early throughout, even if the alloca is escaped.
+ bool Changed = false;
+ SmallVector<User *> AIStoreUsers;
+
+ copy_if(AI.users(), std::back_inserter(AIStoreUsers), [&AI](auto *U) {
+ auto *SI = dyn_cast<StoreInst>(U);
+ return (SI && SI->getPointerOperand() == &AI);
+ });
+
+ if (range_size(AIStoreUsers) != 1)
+ return Changed;
+
+ SmallVector<User *> AILoadUsers;
+
+ copy_if(AI.users(), std::back_inserter(AILoadUsers), [&AI](auto *U) {
+ auto *LI = dyn_cast<LoadInst>(U);
+ return (LI && LI->getPointerOperand() == &AI);
+ });
+
+ auto *StoreInitValInst = dyn_cast<StoreInst>(AIStoreUsers.front());
+
+ assert(StoreInitValInst);
+ auto *InitVal = StoreInitValInst->getValueOperand();
+
+ for (User *U : AILoadUsers) {
+ auto *LI = dyn_cast<LoadInst>(U);
+ assert(LI);
+ assert(DTU->getDomTree().dominates(InitVal, LI));
+ assert(InitVal->getType() == LI->getType());
+ ++NumConstAllocasPropagated;
+ LI->replaceAllUsesWith(InitVal);
+ Changed |= true;
+ }
+
+ return Changed;
+}
+
bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
// Look through each "partition", looking for slices with the same start/end
// that do not overlap with any before them. The slices are sorted by
@@ -5564,8 +5607,11 @@ SROA::runOnAlloca(AllocaInst &AI) {
// Build the slices using a recursive instruction-visiting builder.
AllocaSlices AS(DL, AI);
LLVM_DEBUG(AS.print(dbgs()));
- if (AS.isEscaped())
+ if (AS.isEscaped()) {
+ if (AI.hasMetadata(LLVMContext::MD_immutable))
+ Changed |= tryToPropagateImmutableAllocaInitValue(AI, AS);
return {Changed, CFGChanged};
+ }
if (AS.isEscapedReadOnly()) {
Changed |= propagateStoredValuesToLoads(AI, AS);
diff --git a/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
index 6dd79832b6d38..ada0a1107978a 100644
--- a/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
+++ b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll
@@ -6,13 +6,13 @@ define dso_local i32 @test2(i16 noundef signext %p1) {
; CHECK-LABEL: define dso_local i32 @test2(
; CHECK-SAME: i16 noundef signext [[P1:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[P2:%.*]] = alloca i16, align 2
+; CHECK-NEXT: [[P2:%.*]] = alloca i16, align 2, !immutable [[META0:![0-9]+]]
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[P2]])
; CHECK-NEXT: store i16 [[P1]], ptr [[P2]], align 2
; CHECK-NEXT: call void @foo(ptr noundef [[P2]])
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[P1]] to i32
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P2]], align 2
-; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[P1]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CONV]], [[CONV1]]
; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
; CHECK: [[IF_THEN]]:
@@ -27,7 +27,7 @@ define dso_local i32 @test2(i16 noundef signext %p1) {
entry:
%retval = alloca i32, align 4
%p1.addr = alloca i16, align 2
- %p2 = alloca i16, align 2
+ %p2 = alloca i16, align 2, !immutable !1
%cleanup.dest.slot = alloca i32, align 4
store i16 %p1, ptr %p1.addr, align 2
call void @llvm.lifetime.start.p0(ptr %p2)
@@ -64,3 +64,8 @@ declare void @foo(ptr noundef)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
declare void @llvm.lifetime.end.p0(ptr captures(none))
+
+!1 = !{i32 1}
+;.
+; CHECK: [[META0]] = !{i32 1}
+;.
More information about the llvm-commits
mailing list