[llvm] [Matrix] Place allocas in function entry. (PR #190032)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 12:55:01 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Florian Hahn (fhahn)
<details>
<summary>Changes</summary>
Create allocas for temporary matrices in the function entry block. Limit their lifetimes via lifetime.start & lifetime.end intrinsics. This avoids dynamic allocas.
Improvement suggested in
https://github.com/llvm/llvm-project/pull/188721.
---
Patch is 31.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/190032.diff
7 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp (+24-11)
- (modified) llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll (+12-4)
- (modified) llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll (+24-8)
- (modified) llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll (+9-3)
- (modified) llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll (+6-2)
- (modified) llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll (+8-4)
- (modified) llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll (+6-2)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 49a8626da0a18..68779f1969431 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1873,28 +1873,30 @@ class LowerMatrixIntrinsics {
/// Ensure that the memory in \p Load does not alias \p Store by potentially
/// copying it to a new location. This new or otherwise the original location
/// is returned.
- Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
- CallInst *MatMul) {
+ std::pair<Value *, AllocaInst *>
+ getNonAliasingPointer(LoadInst *Load, StoreInst *Store, CallInst *MatMul) {
MemoryLocation StoreLoc = MemoryLocation::get(Store);
MemoryLocation LoadLoc = MemoryLocation::get(Load);
// If we can statically determine noalias we're good.
if (AA->isNoAlias(LoadLoc, StoreLoc))
- return Load->getPointerOperand();
+ return {Load->getPointerOperand(), nullptr};
// If the pointers are in different address spaces, we cannot compare them
// at runtime. Conservatively copy the load operand to a new buffer.
if (Load->getPointerAddressSpace() != Store->getPointerAddressSpace()) {
- IRBuilder<> Builder(MatMul);
auto *VT = cast<FixedVectorType>(Load->getType());
auto *ArrayTy =
ArrayType::get(VT->getElementType(), VT->getNumElements());
+ IRBuilder<> AllocaBuilder(&Func.getEntryBlock().front());
AllocaInst *Alloca =
- Builder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ IRBuilder<> Builder(MatMul);
+ Builder.CreateLifetimeStart(Alloca);
Builder.CreateMemCpy(Alloca, Alloca->getAlign(),
Load->getPointerOperand(), Load->getAlign(),
LoadLoc.Size.getValue());
- return Alloca;
+ return {Alloca, Alloca};
}
// Create code to check if the memory locations of the Load and Store
@@ -1950,14 +1952,16 @@ class LowerMatrixIntrinsics {
setExplicitlyUnknownBranchWeightsIfProfiled(*BR2, DEBUG_TYPE);
// Copy load operand to new alloca.
- Builder.SetInsertPoint(Copy, Copy->begin());
auto *VT = cast<FixedVectorType>(Load->getType());
// Use an array type for the alloca, to avoid potentially huge alignment
// requirements for large vector types.
auto *ArrayTy = ArrayType::get(VT->getElementType(), VT->getNumElements());
+ IRBuilder<> AllocaBuilder(&Func.getEntryBlock().front());
AllocaInst *Alloca =
- Builder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ Builder.SetInsertPoint(Copy, Copy->begin());
+ Builder.CreateLifetimeStart(Alloca);
Builder.CreateMemCpy(Alloca, Alloca->getAlign(), Load->getPointerOperand(),
Load->getAlign(), LoadLoc.Size.getValue());
Builder.SetInsertPoint(Fusion, Fusion->begin());
@@ -1972,7 +1976,7 @@ class LowerMatrixIntrinsics {
DTUpdates.push_back({DT->Insert, Check1, Copy});
DTUpdates.push_back({DT->Insert, Check1, Fusion});
DT->applyUpdates(DTUpdates);
- return PHI;
+ return {PHI, Alloca};
}
bool isFusionProfitable(CallInst *MatMul) {
@@ -2094,8 +2098,8 @@ class LowerMatrixIntrinsics {
const unsigned M = LShape.NumColumns;
auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();
- Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
- Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
+ auto [APtr, AAlloca] = getNonAliasingPointer(LoadOp0, Store, MatMul);
+ auto [BPtr, BAlloca] = getNonAliasingPointer(LoadOp1, Store, MatMul);
Value *CPtr = Store->getPointerOperand();
// Use loop-based tiling when the number of expected operations exceeds
@@ -2131,6 +2135,15 @@ class LowerMatrixIntrinsics {
}
}
+ // End the lifetime of the allocas used for alias-safe copies.
+ {
+ IRBuilder<> Builder(Store);
+ if (AAlloca)
+ Builder.CreateLifetimeEnd(AAlloca);
+ if (BAlloca)
+ Builder.CreateLifetimeEnd(BAlloca);
+ }
+
// Mark eliminated instructions as fused and remove them.
FusedInsts.insert(Store);
FusedInsts.insert(MatMul);
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
index ac7f8f6b05994..d5b6dee2b4c40 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -11,6 +11,8 @@ target triple = "aarch64-unknown-unknown"
define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-LABEL: @multiply(
; PTR64-NEXT: entry:
+; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
; PTR64-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr [[C:%.*]], i64 128
; PTR64-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; PTR64-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
@@ -19,7 +21,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; PTR64-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; PTR64: copy:
-; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR64-NEXT: br label [[NO_ALIAS]]
; PTR64: no_alias:
@@ -32,7 +34,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; PTR64-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; PTR64: copy2:
-; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR64-NEXT: br label [[NO_ALIAS3]]
; PTR64: no_alias3:
@@ -176,10 +178,14 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
; PTR64-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
; PTR64-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR64-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
+; PTR64-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP6]])
; PTR64-NEXT: ret void
;
; PTR32-LABEL: @multiply(
; PTR32-NEXT: entry:
+; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
; PTR32-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr [[C:%.*]], i32 128
; PTR32-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; PTR32-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
@@ -188,7 +194,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; PTR32-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; PTR32: copy:
-; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR32-NEXT: br label [[NO_ALIAS]]
; PTR32: no_alias:
@@ -201,7 +207,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR32-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; PTR32-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; PTR32: copy2:
-; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR32-NEXT: br label [[NO_ALIAS3]]
; PTR32: no_alias3:
@@ -345,6 +351,8 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR32-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
; PTR32-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i32 112
; PTR32-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR32-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
+; PTR32-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP6]])
; PTR32-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
index 1e089a6fa97e3..6b564a65ccbd6 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
@@ -9,14 +9,16 @@ define void @multiply_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(1) %B,
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(2) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(1)
-; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(1)
-; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP1]])
+; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP0]])
+; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP2]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[TMP2]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP3]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP3]], align 8
@@ -78,6 +80,8 @@ define void @multiply_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(1) %B,
; CHECK-NEXT: store <2 x float> [[TMP17]], ptr addrspace(2) [[TMP32]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(2) [[TMP32]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP31]], ptr addrspace(2) [[VEC_GEP28]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP1]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP0]])
; CHECK-NEXT: ret void
;
entry:
@@ -92,9 +96,11 @@ define void @multiply_all_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(3)
; CHECK-LABEL: define void @multiply_all_diff_addr_spaces(
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(3) [[B:%.*]], ptr addrspace(2) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(3)
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(1)
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP0]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(3)
+; CHECK-NEXT: call void @llvm.lifetime.start.p3(ptr addrspace(3) [[TMP1]])
; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[TMP1]], ptr addrspace(3) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP2]], align 8
@@ -162,6 +168,8 @@ define void @multiply_all_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(3)
; CHECK-NEXT: store <2 x float> [[TMP17]], ptr addrspace(2) [[TMP32]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(2) [[TMP32]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP31]], ptr addrspace(2) [[VEC_GEP28]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP0]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p3(ptr addrspace(3) [[TMP1]])
; CHECK-NEXT: ret void
;
entry:
@@ -178,6 +186,8 @@ define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrsp
; CHECK-LABEL: define void @multiply_first_load_same_addr_space(
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(2) [[B:%.*]], ptr addrspace(1) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP4:%.*]] = alloca [4 x float], align 4, addrspace(2)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[C]], i64 16
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr addrspace(1) [[A]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label %[[ALIAS_CONT:.*]], label %[[NO_ALIAS:.*]]
@@ -186,12 +196,12 @@ define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrsp
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[COPY:.*]], label %[[NO_ALIAS]]
; CHECK: [[COPY]]:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x float], align 4, addrspace(1)
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: br label %[[NO_ALIAS]]
; CHECK: [[NO_ALIAS]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi ptr addrspace(1) [ [[A]], %[[ENTRY]] ], [ [[A]], %[[ALIAS_CONT]] ], [ [[TMP2]], %[[COPY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = alloca [4 x float], align 4, addrspace(2)
+; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[TMP4]])
; CHECK-NEXT: call void @llvm.memcpy.p2.p2.i64(ptr addrspace(2) align 4 [[TMP4]], ptr addrspace(2) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr addrspace(1) [[TMP3]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 8
@@ -259,6 +269,8 @@ define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrsp
; CHECK-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[TMP35]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(1) [[TMP35]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP34]], ptr addrspace(1) [[VEC_GEP28]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP2]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[TMP4]])
; CHECK-NEXT: ret void
;
entry:
@@ -275,7 +287,9 @@ define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrs
; CHECK-LABEL: define void @multiply_second_load_same_addr_space(
; CHECK-SAME: ptr addrspace(2) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP3:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(2)
+; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[TMP0]])
; CHECK-NEXT: call void @llvm.memcpy.p2.p2.i64(ptr addrspace(2) align 4 [[TMP0]], ptr addrspace(2) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[C]], i64 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[B]], [[STORE_END]]
@@ -285,7 +299,7 @@ define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrs
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP2]], label %[[COPY:.*]], label %[[NO_ALIAS]]
; CHECK: [[COPY]]:
-; CHECK-NEXT: [[TMP3:%.*]] = alloca [4 x float], align 4, addrspace(1)
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP3]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP3]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: br label %[[NO_ALIAS]]
; CHECK: [[NO_ALIAS]]:
@@ -356,6 +370,8 @@ define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrs
; CHECK-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[TMP35]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(1) [[TMP35]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP34]], ptr addrspace(1) [[VEC_GEP28]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[TMP0]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP3]])
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
index aa40a2df06817..741d4fb89a48e 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
@@ -9,6 +9,7 @@ target triple = "aarch64-apple-ios"
define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-LABEL: @multiply_can_hoist_cast(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x double], align 8
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr [[C:%.*]], i64 32
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
@@ -17,7 +18,7 @@ define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -65,6 +66,7 @@ define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]])
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 24
; CHECK-NEXT: store <1 x double> [[TMP25]], ptr [[TMP26]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: ret void
;
entry:
@@ -78,6 +80,7 @@ entry:
define void @multiply_can_hoist_multiple_insts(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-LABEL: @mult...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/190032
More information about the llvm-commits
mailing list