[llvm] [Matrix] Place allocas in function entry. (PR #190032)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 10:05:58 PDT 2026
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/190032
>From 913a0ecce5d07cab972830d806f6c75a4b21800a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 28 Mar 2026 09:59:46 +0000
Subject: [PATCH 1/2] [Matrix] Place allocas in function entry.
Create allocas for temporary matrices in the function entry. Limit the
lifetime via lifetime.start & lifetime.end. This avoids dynamic allocas.
Improvement suggested in
https://github.com/llvm/llvm-project/pull/188721.
---
.../Scalar/LowerMatrixIntrinsics.cpp | 35 +++++++++++++------
.../data-layout-multiply-fused.ll | 16 ++++++---
.../multiply-fused-differing-addr-spaces.ll | 32 ++++++++++++-----
.../multiply-fused-dominance.ll | 12 +++++--
.../multiply-fused-loops.ll | 8 +++--
.../multiply-fused-multiple-blocks.ll | 12 ++++---
.../LowerMatrixIntrinsics/multiply-fused.ll | 8 +++--
7 files changed, 89 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 49a8626da0a18..68779f1969431 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1873,28 +1873,30 @@ class LowerMatrixIntrinsics {
/// Ensure that the memory in \p Load does not alias \p Store by potentially
/// copying it to a new location. This new or otherwise the original location
/// is returned.
- Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
- CallInst *MatMul) {
+ std::pair<Value *, AllocaInst *>
+ getNonAliasingPointer(LoadInst *Load, StoreInst *Store, CallInst *MatMul) {
MemoryLocation StoreLoc = MemoryLocation::get(Store);
MemoryLocation LoadLoc = MemoryLocation::get(Load);
// If we can statically determine noalias we're good.
if (AA->isNoAlias(LoadLoc, StoreLoc))
- return Load->getPointerOperand();
+ return {Load->getPointerOperand(), nullptr};
// If the pointers are in different address spaces, we cannot compare them
// at runtime. Conservatively copy the load operand to a new buffer.
if (Load->getPointerAddressSpace() != Store->getPointerAddressSpace()) {
- IRBuilder<> Builder(MatMul);
auto *VT = cast<FixedVectorType>(Load->getType());
auto *ArrayTy =
ArrayType::get(VT->getElementType(), VT->getNumElements());
+ IRBuilder<> AllocaBuilder(&Func.getEntryBlock().front());
AllocaInst *Alloca =
- Builder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ IRBuilder<> Builder(MatMul);
+ Builder.CreateLifetimeStart(Alloca);
Builder.CreateMemCpy(Alloca, Alloca->getAlign(),
Load->getPointerOperand(), Load->getAlign(),
LoadLoc.Size.getValue());
- return Alloca;
+ return {Alloca, Alloca};
}
// Create code to check if the memory locations of the Load and Store
@@ -1950,14 +1952,16 @@ class LowerMatrixIntrinsics {
setExplicitlyUnknownBranchWeightsIfProfiled(*BR2, DEBUG_TYPE);
// Copy load operand to new alloca.
- Builder.SetInsertPoint(Copy, Copy->begin());
auto *VT = cast<FixedVectorType>(Load->getType());
// Use an array type for the alloca, to avoid potentially huge alignment
// requirements for large vector types.
auto *ArrayTy = ArrayType::get(VT->getElementType(), VT->getNumElements());
+ IRBuilder<> AllocaBuilder(&Func.getEntryBlock().front());
AllocaInst *Alloca =
- Builder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ Builder.SetInsertPoint(Copy, Copy->begin());
+ Builder.CreateLifetimeStart(Alloca);
Builder.CreateMemCpy(Alloca, Alloca->getAlign(), Load->getPointerOperand(),
Load->getAlign(), LoadLoc.Size.getValue());
Builder.SetInsertPoint(Fusion, Fusion->begin());
@@ -1972,7 +1976,7 @@ class LowerMatrixIntrinsics {
DTUpdates.push_back({DT->Insert, Check1, Copy});
DTUpdates.push_back({DT->Insert, Check1, Fusion});
DT->applyUpdates(DTUpdates);
- return PHI;
+ return {PHI, Alloca};
}
bool isFusionProfitable(CallInst *MatMul) {
@@ -2094,8 +2098,8 @@ class LowerMatrixIntrinsics {
const unsigned M = LShape.NumColumns;
auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();
- Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
- Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
+ auto [APtr, AAlloca] = getNonAliasingPointer(LoadOp0, Store, MatMul);
+ auto [BPtr, BAlloca] = getNonAliasingPointer(LoadOp1, Store, MatMul);
Value *CPtr = Store->getPointerOperand();
// Use loop-based tiling when the number of expected operations exceeds
@@ -2131,6 +2135,15 @@ class LowerMatrixIntrinsics {
}
}
+ // End the lifetime of the allocas used for alias-safe copies.
+ {
+ IRBuilder<> Builder(Store);
+ if (AAlloca)
+ Builder.CreateLifetimeEnd(AAlloca);
+ if (BAlloca)
+ Builder.CreateLifetimeEnd(BAlloca);
+ }
+
// Mark eliminated instructions as fused and remove them.
FusedInsts.insert(Store);
FusedInsts.insert(MatMul);
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
index ac7f8f6b05994..d5b6dee2b4c40 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -11,6 +11,8 @@ target triple = "aarch64-unknown-unknown"
define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-LABEL: @multiply(
; PTR64-NEXT: entry:
+; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
; PTR64-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr [[C:%.*]], i64 128
; PTR64-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; PTR64-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
@@ -19,7 +21,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; PTR64-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; PTR64: copy:
-; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR64-NEXT: br label [[NO_ALIAS]]
; PTR64: no_alias:
@@ -32,7 +34,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; PTR64-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; PTR64: copy2:
-; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR64-NEXT: br label [[NO_ALIAS3]]
; PTR64: no_alias3:
@@ -176,10 +178,14 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
; PTR64-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
; PTR64-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR64-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
+; PTR64-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP6]])
; PTR64-NEXT: ret void
;
; PTR32-LABEL: @multiply(
; PTR32-NEXT: entry:
+; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
; PTR32-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr [[C:%.*]], i32 128
; PTR32-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; PTR32-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
@@ -188,7 +194,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; PTR32-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; PTR32: copy:
-; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR32-NEXT: br label [[NO_ALIAS]]
; PTR32: no_alias:
@@ -201,7 +207,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR32-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; PTR32-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; PTR32: copy2:
-; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR32-NEXT: br label [[NO_ALIAS3]]
; PTR32: no_alias3:
@@ -345,6 +351,8 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR32-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
; PTR32-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i32 112
; PTR32-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR32-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
+; PTR32-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP6]])
; PTR32-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
index 1e089a6fa97e3..6b564a65ccbd6 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
@@ -9,14 +9,16 @@ define void @multiply_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(1) %B,
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(2) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(1)
-; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(1)
-; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP1]])
+; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP0]])
+; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP2]], align 8
; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[TMP2]], i64 2
; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 8
; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr addrspace(1) [[TMP3]], i64 2
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr addrspace(1) [[VEC_GEP3]], align 8
@@ -78,6 +80,8 @@ define void @multiply_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(1) %B,
; CHECK-NEXT: store <2 x float> [[TMP17]], ptr addrspace(2) [[TMP32]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(2) [[TMP32]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP31]], ptr addrspace(2) [[VEC_GEP28]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP1]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP0]])
; CHECK-NEXT: ret void
;
entry:
@@ -92,9 +96,11 @@ define void @multiply_all_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(3)
; CHECK-LABEL: define void @multiply_all_diff_addr_spaces(
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(3) [[B:%.*]], ptr addrspace(2) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(3)
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(1)
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP0]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = alloca [4 x float], align 4, addrspace(3)
+; CHECK-NEXT: call void @llvm.lifetime.start.p3(ptr addrspace(3) [[TMP1]])
; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[TMP1]], ptr addrspace(3) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr addrspace(1) [[TMP0]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP2]], align 8
@@ -162,6 +168,8 @@ define void @multiply_all_diff_addr_spaces(ptr addrspace(1) %A, ptr addrspace(3)
; CHECK-NEXT: store <2 x float> [[TMP17]], ptr addrspace(2) [[TMP32]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(2) [[TMP32]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP31]], ptr addrspace(2) [[VEC_GEP28]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP0]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p3(ptr addrspace(3) [[TMP1]])
; CHECK-NEXT: ret void
;
entry:
@@ -178,6 +186,8 @@ define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrsp
; CHECK-LABEL: define void @multiply_first_load_same_addr_space(
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], ptr addrspace(2) [[B:%.*]], ptr addrspace(1) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP4:%.*]] = alloca [4 x float], align 4, addrspace(2)
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[C]], i64 16
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr addrspace(1) [[A]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label %[[ALIAS_CONT:.*]], label %[[NO_ALIAS:.*]]
@@ -186,12 +196,12 @@ define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrsp
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[COPY:.*]], label %[[NO_ALIAS]]
; CHECK: [[COPY]]:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x float], align 4, addrspace(1)
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: br label %[[NO_ALIAS]]
; CHECK: [[NO_ALIAS]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi ptr addrspace(1) [ [[A]], %[[ENTRY]] ], [ [[A]], %[[ALIAS_CONT]] ], [ [[TMP2]], %[[COPY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = alloca [4 x float], align 4, addrspace(2)
+; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[TMP4]])
; CHECK-NEXT: call void @llvm.memcpy.p2.p2.i64(ptr addrspace(2) align 4 [[TMP4]], ptr addrspace(2) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr addrspace(1) [[TMP3]], i64 0
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 8
@@ -259,6 +269,8 @@ define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrsp
; CHECK-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[TMP35]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(1) [[TMP35]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP34]], ptr addrspace(1) [[VEC_GEP28]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP2]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[TMP4]])
; CHECK-NEXT: ret void
;
entry:
@@ -275,7 +287,9 @@ define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrs
; CHECK-LABEL: define void @multiply_second_load_same_addr_space(
; CHECK-SAME: ptr addrspace(2) [[A:%.*]], ptr addrspace(1) [[B:%.*]], ptr addrspace(1) [[C:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP3:%.*]] = alloca [4 x float], align 4, addrspace(1)
; CHECK-NEXT: [[TMP0:%.*]] = alloca [4 x float], align 4, addrspace(2)
+; CHECK-NEXT: call void @llvm.lifetime.start.p2(ptr addrspace(2) [[TMP0]])
; CHECK-NEXT: call void @llvm.memcpy.p2.p2.i64(ptr addrspace(2) align 4 [[TMP0]], ptr addrspace(2) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[C]], i64 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[B]], [[STORE_END]]
@@ -285,7 +299,7 @@ define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrs
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP2]], label %[[COPY:.*]], label %[[NO_ALIAS]]
; CHECK: [[COPY]]:
-; CHECK-NEXT: [[TMP3:%.*]] = alloca [4 x float], align 4, addrspace(1)
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP3]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP3]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: br label %[[NO_ALIAS]]
; CHECK: [[NO_ALIAS]]:
@@ -356,6 +370,8 @@ define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrs
; CHECK-NEXT: store <2 x float> [[TMP20]], ptr addrspace(1) [[TMP35]], align 8
; CHECK-NEXT: [[VEC_GEP28:%.*]] = getelementptr float, ptr addrspace(1) [[TMP35]], i64 2
; CHECK-NEXT: store <2 x float> [[TMP34]], ptr addrspace(1) [[VEC_GEP28]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p2(ptr addrspace(2) [[TMP0]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p1(ptr addrspace(1) [[TMP3]])
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
index aa40a2df06817..741d4fb89a48e 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
@@ -9,6 +9,7 @@ target triple = "aarch64-apple-ios"
define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-LABEL: @multiply_can_hoist_cast(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x double], align 8
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr [[C:%.*]], i64 32
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
@@ -17,7 +18,7 @@ define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -65,6 +66,7 @@ define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]])
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 24
; CHECK-NEXT: store <1 x double> [[TMP25]], ptr [[TMP26]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: ret void
;
entry:
@@ -78,6 +80,7 @@ entry:
define void @multiply_can_hoist_multiple_insts(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-LABEL: @multiply_can_hoist_multiple_insts(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x double], align 8
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 64
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr i8, ptr [[C]], i64 96
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END]]
@@ -87,7 +90,7 @@ define void @multiply_can_hoist_multiple_insts(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[GEP]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -135,6 +138,7 @@ define void @multiply_can_hoist_multiple_insts(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]])
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 88
; CHECK-NEXT: store <1 x double> [[TMP25]], ptr [[TMP26]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: ret void
;
entry:
@@ -150,6 +154,7 @@ entry:
define void @multiply_can_hoist_multiple_insts2(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-LABEL: @multiply_can_hoist_multiple_insts2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x double], align 8
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 1344
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr i8, ptr [[C]], i64 1376
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END]]
@@ -159,7 +164,7 @@ define void @multiply_can_hoist_multiple_insts2(ptr noalias %A, ptr %B, ptr %C)
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[GEP_1]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -207,6 +212,7 @@ define void @multiply_can_hoist_multiple_insts2(ptr noalias %A, ptr %B, ptr %C)
; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]])
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 1368
; CHECK-NEXT: store <1 x double> [[TMP25]], ptr [[TMP26]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
index 22a17e42b3e48..8f2d7971fb7d1 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -258,6 +258,8 @@ declare <32 x i64> @llvm.matrix.multiply.v32i64.v8i64.v16i64(<8 x i64>, <16 x i6
define void @multiply_alias_2x2(ptr %A, ptr %B, ptr %C) {
; CHECK-LABEL: @multiply_alias_2x2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP6:%.*]] = alloca [4 x float], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x float], align 4
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr [[C:%.*]], i64 16
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
@@ -266,7 +268,7 @@ define void @multiply_alias_2x2(ptr %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [4 x float], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(16) [[A]], i64 16, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -279,7 +281,7 @@ define void @multiply_alias_2x2(ptr %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; CHECK-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; CHECK: copy2:
-; CHECK-NEXT: [[TMP6:%.*]] = alloca [4 x float], align 4
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS3]]
; CHECK: no_alias3:
@@ -341,6 +343,8 @@ define void @multiply_alias_2x2(ptr %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[COLS_COND_NOT:%.*]] = icmp eq i64 [[COLS_IV]], 0
; CHECK-NEXT: br i1 [[COLS_COND_NOT]], label [[CONTINUE:%.*]], label [[COLS_HEADER]], !prof [[PROF4]]
; CHECK: continue:
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
index 2ce5367cc9c0f..251243a3e05f1 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
@@ -9,6 +9,10 @@ target triple = "aarch64-apple-ios"
define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP38:%.*]] = alloca [6 x double], align 8
+; CHECK-NEXT: [[TMP34:%.*]] = alloca [6 x double], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = alloca [6 x double], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [6 x double], align 8
; CHECK-NEXT: [[COL_LOAD133:%.*]] = load <3 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[VEC_GEP134:%.*]] = getelementptr i8, ptr [[A]], i64 24
; CHECK-NEXT: [[COL_LOAD135:%.*]] = load <3 x double>, ptr [[VEC_GEP134]], align 8
@@ -25,7 +29,7 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [6 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(48) [[A]], i64 48, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -38,7 +42,7 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; CHECK-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; CHECK: copy2:
-; CHECK-NEXT: [[TMP6:%.*]] = alloca [6 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(48) [[B]], i64 48, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS3]]
; CHECK: no_alias3:
@@ -129,7 +133,7 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP33:%.*]] = icmp ult ptr [[C]], [[LOAD_END63]]
; CHECK-NEXT: br i1 [[TMP33]], label [[COPY62:%.*]], label [[NO_ALIAS63]]
; CHECK: copy60:
-; CHECK-NEXT: [[TMP34:%.*]] = alloca [6 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP34]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) [[TMP34]], ptr noundef nonnull align 8 dereferenceable(48) [[A]], i64 48, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS63]]
; CHECK: no_alias61:
@@ -142,7 +146,7 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP37:%.*]] = icmp ult ptr [[C]], [[LOAD_END68]]
; CHECK-NEXT: br i1 [[TMP37]], label [[COPY69:%.*]], label [[NO_ALIAS70]]
; CHECK: copy65:
-; CHECK-NEXT: [[TMP38:%.*]] = alloca [6 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP38]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) [[TMP38]], ptr noundef nonnull align 8 dereferenceable(48) [[B]], i64 48, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS70]]
; CHECK: no_alias66:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
index 9022bff06ac16..0cceeebac34b3 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -11,6 +11,8 @@ target triple = "aarch64-apple-ios"
define void @multiply(ptr %A, ptr %B, ptr %C) !prof !0 {
; CHECK-LABEL: @multiply(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
; CHECK-NEXT: [[STORE_END:%.*]] = getelementptr inbounds nuw i8, ptr [[C:%.*]], i64 128
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]], !prof [[PROF1:![0-9]+]]
@@ -19,7 +21,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) !prof !0 {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]], !prof [[PROF1]]
; CHECK: copy:
-; CHECK-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -32,7 +34,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) !prof !0 {
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; CHECK-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]], !prof [[PROF1]]
; CHECK: copy2:
-; CHECK-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[B]], i64 128, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS3]]
; CHECK: no_alias3:
@@ -176,6 +178,8 @@ define void @multiply(ptr %A, ptr %B, ptr %C) !prof !0 {
; CHECK-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
; CHECK-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
; CHECK-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: ret void
;
>From 4b9ca167d1f7f8c56f18deabccb2aa77f2d0e1b8 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 2 Apr 2026 12:33:01 +0100
Subject: [PATCH 2/2] !fixup place lifetime.start before conditional check.
---
.../Scalar/LowerMatrixIntrinsics.cpp | 20 +++++++++----------
.../data-layout-multiply-fused.ll | 8 ++++----
.../multiply-fused-differing-addr-spaces.ll | 4 ++--
.../multiply-fused-dominance.ll | 6 +++---
.../multiply-fused-loops.ll | 4 ++--
.../multiply-fused-multiple-blocks.ll | 12 +++++++----
.../LowerMatrixIntrinsics/multiply-fused.ll | 4 ++--
7 files changed, 31 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 68779f1969431..b920caf737df0 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1884,11 +1884,11 @@ class LowerMatrixIntrinsics {
// If the pointers are in different address spaces, we cannot compare them
// at runtime. Conservatively copy the load operand to a new buffer.
+ IRBuilder<> AllocaBuilder(&Func.getEntryBlock().front());
if (Load->getPointerAddressSpace() != Store->getPointerAddressSpace()) {
auto *VT = cast<FixedVectorType>(Load->getType());
auto *ArrayTy =
ArrayType::get(VT->getElementType(), VT->getNumElements());
- IRBuilder<> AllocaBuilder(&Func.getEntryBlock().front());
AllocaInst *Alloca =
AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
IRBuilder<> Builder(MatMul);
@@ -1943,6 +1943,15 @@ class LowerMatrixIntrinsics {
// overlap.
Check1->getTerminator()->eraseFromParent();
Builder.SetInsertPoint(Check1, Check1->begin());
+
+ auto *VT = cast<FixedVectorType>(Load->getType());
+ // Use an array type for the alloca, to avoid potentially huge alignment
+ // requirements for large vector types.
+ auto *ArrayTy = ArrayType::get(VT->getElementType(), VT->getNumElements());
+ AllocaInst *Alloca =
+ AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
+ Builder.CreateLifetimeStart(Alloca);
+
Value *LoadEnd = Builder.CreatePtrAdd(
LoadBegin, ConstantInt::get(AddrTy, LoadLoc.Size.getValue()),
"load.end",
@@ -1952,16 +1961,7 @@ class LowerMatrixIntrinsics {
setExplicitlyUnknownBranchWeightsIfProfiled(*BR2, DEBUG_TYPE);
// Copy load operand to new alloca.
- auto *VT = cast<FixedVectorType>(Load->getType());
- // Use an array type for the alloca, to avoid potentially huge alignment
- // requirements for large vector types.
- auto *ArrayTy = ArrayType::get(VT->getElementType(), VT->getNumElements());
- IRBuilder<> AllocaBuilder(&Func.getEntryBlock().front());
- AllocaInst *Alloca =
- AllocaBuilder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace());
-
Builder.SetInsertPoint(Copy, Copy->begin());
- Builder.CreateLifetimeStart(Alloca);
Builder.CreateMemCpy(Alloca, Alloca->getAlign(), Load->getPointerOperand(),
Load->getAlign(), LoadLoc.Size.getValue());
Builder.SetInsertPoint(Fusion, Fusion->begin());
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
index d5b6dee2b4c40..3688794a6910c 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -17,11 +17,11 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; PTR64-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
; PTR64: alias_cont:
+; PTR64-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; PTR64-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 128
; PTR64-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; PTR64-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; PTR64: copy:
-; PTR64-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR64-NEXT: br label [[NO_ALIAS]]
; PTR64: no_alias:
@@ -30,11 +30,11 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR64-NEXT: [[TMP4:%.*]] = icmp ult ptr [[A]], [[STORE_END4]]
; PTR64-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
; PTR64: alias_cont1:
+; PTR64-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; PTR64-NEXT: [[LOAD_END5:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 128
; PTR64-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; PTR64-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; PTR64: copy2:
-; PTR64-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR64-NEXT: br label [[NO_ALIAS3]]
; PTR64: no_alias3:
@@ -190,11 +190,11 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR32-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; PTR32-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
; PTR32: alias_cont:
+; PTR32-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; PTR32-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 128
; PTR32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; PTR32-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; PTR32: copy:
-; PTR32-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR32-NEXT: br label [[NO_ALIAS]]
; PTR32: no_alias:
@@ -203,11 +203,11 @@ define void @multiply(ptr %A, ptr %B, ptr %C) {
; PTR32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[A]], [[STORE_END4]]
; PTR32-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
; PTR32: alias_cont1:
+; PTR32-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; PTR32-NEXT: [[LOAD_END5:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i32 128
; PTR32-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; PTR32-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; PTR32: copy2:
-; PTR32-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; PTR32-NEXT: br label [[NO_ALIAS3]]
; PTR32: no_alias3:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
index 6b564a65ccbd6..a41d2f7fc033d 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-differing-addr-spaces.ll
@@ -192,11 +192,11 @@ define void @multiply_first_load_same_addr_space(ptr addrspace(1) %A, ptr addrsp
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr addrspace(1) [[A]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label %[[ALIAS_CONT:.*]], label %[[NO_ALIAS:.*]]
; CHECK: [[ALIAS_CONT]]:
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP2]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[A]], i64 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[COPY:.*]], label %[[NO_ALIAS]]
; CHECK: [[COPY]]:
-; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(1) align 8 [[A]], i64 16, i1 false)
; CHECK-NEXT: br label %[[NO_ALIAS]]
; CHECK: [[NO_ALIAS]]:
@@ -295,11 +295,11 @@ define void @multiply_second_load_same_addr_space(ptr addrspace(2) %A, ptr addrs
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr addrspace(1) [[B]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[ALIAS_CONT:.*]], label %[[NO_ALIAS:.*]]
; CHECK: [[ALIAS_CONT]]:
+; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP3]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[B]], i64 16
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult ptr addrspace(1) [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP2]], label %[[COPY:.*]], label %[[NO_ALIAS]]
; CHECK: [[COPY]]:
-; CHECK-NEXT: call void @llvm.lifetime.start.p1(ptr addrspace(1) [[TMP3]])
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[TMP3]], ptr addrspace(1) align 8 [[B]], i64 16, i1 false)
; CHECK-NEXT: br label %[[NO_ALIAS]]
; CHECK: [[NO_ALIAS]]:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
index 741d4fb89a48e..270c8e2fd42c2 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
@@ -14,11 +14,11 @@ define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
; CHECK: alias_cont:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -86,11 +86,11 @@ define void @multiply_can_hoist_multiple_insts(ptr noalias %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
; CHECK: alias_cont:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[GEP]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -160,11 +160,11 @@ define void @multiply_can_hoist_multiple_insts2(ptr noalias %A, ptr %B, ptr %C)
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
; CHECK: alias_cont:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[GEP_1]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
index 8f2d7971fb7d1..c65729aee66c6 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -264,11 +264,11 @@ define void @multiply_alias_2x2(ptr %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
; CHECK: alias_cont:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(16) [[A]], i64 16, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -277,11 +277,11 @@ define void @multiply_alias_2x2(ptr %A, ptr %B, ptr %C) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END4]]
; CHECK-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
; CHECK: alias_cont1:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: [[LOAD_END5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; CHECK-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; CHECK: copy2:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS3]]
; CHECK: no_alias3:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
index 251243a3e05f1..7084c3e2d1fc6 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll
@@ -25,11 +25,11 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
; CHECK: alias_cont:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 48
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
; CHECK: copy:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(48) [[A]], i64 48, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -38,11 +38,11 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[B]], [[STORE_END4]]
; CHECK-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
; CHECK: alias_cont1:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: [[LOAD_END5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; CHECK-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
; CHECK: copy2:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(48) [[B]], i64 48, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS3]]
; CHECK: no_alias3:
@@ -106,6 +106,8 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD53]], <1 x double> [[SPLAT_SPLATINSERT59]], <1 x double> [[TMP24]])
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 64
; CHECK-NEXT: store <1 x double> [[TMP25]], ptr [[TMP26]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP2]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: br i1 [[COND:%.*]], label [[TRUE:%.*]], label [[FALSE:%.*]]
; CHECK: true:
; CHECK-NEXT: [[TMP27:%.*]] = fadd contract <3 x double> [[COL_LOAD133]], [[COL_LOAD133]]
@@ -129,11 +131,11 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP32:%.*]] = icmp ult ptr [[A]], [[STORE_END62]]
; CHECK-NEXT: br i1 [[TMP32]], label [[ALIAS_CONT61:%.*]], label [[NO_ALIAS63:%.*]]
; CHECK: alias_cont59:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP34]])
; CHECK-NEXT: [[LOAD_END63:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 48
; CHECK-NEXT: [[TMP33:%.*]] = icmp ult ptr [[C]], [[LOAD_END63]]
; CHECK-NEXT: br i1 [[TMP33]], label [[COPY62:%.*]], label [[NO_ALIAS63]]
; CHECK: copy60:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP34]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) [[TMP34]], ptr noundef nonnull align 8 dereferenceable(48) [[A]], i64 48, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS63]]
; CHECK: no_alias61:
@@ -142,11 +144,11 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP36:%.*]] = icmp ult ptr [[B]], [[STORE_END67]]
; CHECK-NEXT: br i1 [[TMP36]], label [[ALIAS_CONT68:%.*]], label [[NO_ALIAS70:%.*]]
; CHECK: alias_cont64:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP38]])
; CHECK-NEXT: [[LOAD_END68:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
; CHECK-NEXT: [[TMP37:%.*]] = icmp ult ptr [[C]], [[LOAD_END68]]
; CHECK-NEXT: br i1 [[TMP37]], label [[COPY69:%.*]], label [[NO_ALIAS70]]
; CHECK: copy65:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP38]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) [[TMP38]], ptr noundef nonnull align 8 dereferenceable(48) [[B]], i64 48, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS70]]
; CHECK: no_alias66:
@@ -210,6 +212,8 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) {
; CHECK-NEXT: [[TMP57:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD125]], <1 x double> [[SPLAT_SPLATINSERT131]], <1 x double> [[TMP56]])
; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[C]], i64 64
; CHECK-NEXT: store <1 x double> [[TMP57]], ptr [[TMP58]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP34]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[TMP38]])
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
index 0cceeebac34b3..21fdbf678975a 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -17,11 +17,11 @@ define void @multiply(ptr %A, ptr %B, ptr %C) !prof !0 {
; CHECK-NEXT: [[TMP0:%.*]] = icmp ult ptr [[A:%.*]], [[STORE_END]]
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]], !prof [[PROF1:![0-9]+]]
; CHECK: alias_cont:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: [[LOAD_END:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 128
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult ptr [[C]], [[LOAD_END]]
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]], !prof [[PROF1]]
; CHECK: copy:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP2]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS]]
; CHECK: no_alias:
@@ -30,11 +30,11 @@ define void @multiply(ptr %A, ptr %B, ptr %C) !prof !0 {
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[B:%.*]], [[STORE_END4]]
; CHECK-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]], !prof [[PROF1]]
; CHECK: alias_cont1:
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: [[LOAD_END5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 128
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[C]], [[LOAD_END5]]
; CHECK-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]], !prof [[PROF1]]
; CHECK: copy2:
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[TMP6]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[B]], i64 128, i1 false)
; CHECK-NEXT: br label [[NO_ALIAS3]]
; CHECK: no_alias3:
More information about the llvm-commits
mailing list