[llvm] e3284b9 - [LowerMemIntrinsics][AMDGPU] Propagate Debug Value (#178131)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 27 01:16:19 PST 2026
Author: Abhishek Kaushik
Date: 2026-01-27T14:46:14+05:30
New Revision: e3284b96815d7ef9ac358724b49a25befc2c4439
URL: https://github.com/llvm/llvm-project/commit/e3284b96815d7ef9ac358724b49a25befc2c4439
DIFF: https://github.com/llvm/llvm-project/commit/e3284b96815d7ef9ac358724b49a25befc2c4439.diff
LOG: [LowerMemIntrinsics][AMDGPU] Propagate Debug Value (#178131)
Propagate the debug location (`DebugLoc`) of the original intrinsic call to
the instructions generated when `memcpy`, `memmove` and `memset` intrinsics
are expanded into loops.
Added:
llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/expand-mem-intrinsics.ll
Modified:
llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index e294a3e29c28d..e77cdbe7a3876 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -140,7 +140,9 @@ insertLoopExpansion(Instruction *InsertBefore, Value *Len,
InsertBefore, BBNamePrefix + "-post-expansion");
Function *ParentFunc = PreLoopBB->getParent();
LLVMContext &Ctx = PreLoopBB->getContext();
+ const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
+ PreLoopBuilder.SetCurrentDebugLocation(DbgLoc);
// Calculate the main loop trip count and remaining units to cover after the
// loop.
@@ -179,6 +181,7 @@ insertLoopExpansion(Instruction *InsertBefore, Value *Len,
BasicBlock *MainLoopBB = BasicBlock::Create(
Ctx, BBNamePrefix + "-expansion-main-body", ParentFunc, PostLoopBB);
IRBuilder<> LoopBuilder(MainLoopBB);
+ LoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *LoopIndex = LoopBuilder.CreatePHI(LenType, 2, "loop-index");
LEI.MainLoopIndex = LoopIndex;
@@ -228,10 +231,12 @@ insertLoopExpansion(Instruction *InsertBefore, Value *Len,
// Determine if we need to branch to the residual loop or bypass it.
IRBuilder<> RCBuilder(ResidualCondBB);
+ RCBuilder.SetCurrentDebugLocation(DbgLoc);
RCBuilder.CreateCondBr(RCBuilder.CreateICmpNE(ResidualUnits, Zero),
ResLoopBB, PostLoopBB);
IRBuilder<> ResBuilder(ResLoopBB);
+ ResBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *ResidualIndex =
ResBuilder.CreatePHI(LenType, 2, "residual-loop-index");
ResidualIndex->addIncoming(Zero, ResidualCondBB);
@@ -589,7 +594,9 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
ConstantInt::get(ILengthType, ResidualLoopOpSize);
ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
+ const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
IRBuilder<> PLBuilder(InsertBefore);
+ PLBuilder.SetCurrentDebugLocation(DbgLoc);
Value *RuntimeLoopBytes = CopyLen;
Value *RuntimeLoopRemainder = nullptr;
@@ -675,6 +682,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
BasicBlock *ResidualLoopBB = BasicBlock::Create(
F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+ ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
@@ -697,6 +705,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
// Later code expects a terminator in the PredBB.
IRBuilder<> IntermediateBuilder(IntermediateBB);
+ IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
IntermediateBuilder.CreateUnreachable();
ResidualLoopBuilder.CreateCondBr(
ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
@@ -706,8 +715,10 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
// How to get to the residual:
- BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
- ThenTerm->getIterator());
+ BranchInst *BrInst =
+ BranchInst::Create(IntermediateBB, ResidualLoopBB,
+ SkipResidualCondition, ThenTerm->getIterator());
+ BrInst->setDebugLoc(DbgLoc);
ThenTerm->eraseFromParent();
PredBB = IntermediateBB;
@@ -715,6 +726,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
// main loop
IRBuilder<> MainLoopBuilder(MainLoopBB);
+ MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
Value *MainIndex =
MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
@@ -733,8 +745,9 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
// How to get to the main loop:
Instruction *PredBBTerm = PredBB->getTerminator();
- BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
- PredBBTerm->getIterator());
+ BranchInst *BrInst = BranchInst::Create(
+ ExitBB, MainLoopBB, SkipMainCondition, PredBBTerm->getIterator());
+ BrInst->setDebugLoc(DbgLoc);
PredBBTerm->eraseFromParent();
}
@@ -744,6 +757,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
BasicBlock *MainLoopBB =
BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
IRBuilder<> MainLoopBuilder(MainLoopBB);
+ MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *MainLoopPhi =
MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
Value *LoadGEP =
@@ -770,13 +784,16 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
MainLoopBB);
// getting in or skipping the main loop
- BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
- CopyFwdBBTerm->getIterator());
+ BranchInst *BrInst =
+ BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
+ CopyFwdBBTerm->getIterator());
+ BrInst->setDebugLoc(DbgLoc);
CopyFwdBBTerm->eraseFromParent();
if (RequiresResidual) {
BasicBlock *IntermediateBB = SuccessorBB;
IRBuilder<> IntermediateBuilder(IntermediateBB);
+ IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
BasicBlock *ResidualLoopBB = BasicBlock::Create(
F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
@@ -784,6 +801,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
// Residual loop
IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+ ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *ResidualLoopPhi =
ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
@@ -841,7 +859,9 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
ConstantInt *LoopBound = ConstantInt::get(ILengthType, BytesCopiedInLoop);
ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+ const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
IRBuilder<> PLBuilder(InsertBefore);
+ PLBuilder.SetCurrentDebugLocation(DbgLoc);
auto [CmpSrcAddr, CmpDstAddr] =
tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
@@ -894,6 +914,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
// instead of after it.
IRBuilder<> BwdResBuilder(CopyBackwardsBB,
CopyBackwardsBB->getFirstNonPHIIt());
+ BwdResBuilder.SetCurrentDebugLocation(DbgLoc);
SmallVector<Type *, 5> RemainingOps;
TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
SrcAS, DstAS, PartSrcAlign,
@@ -917,6 +938,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
CopyBackwardsBB->setName("memmove_bwd_loop");
}
IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+ LoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
Value *Index = LoopBuilder.CreateSub(LoopPhi, CILoopOpSize, "bwd_index");
Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, Index);
@@ -950,6 +972,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
FwdResidualBB = SuccBB;
}
IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+ LoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopPhi);
Value *Element = LoopBuilder.CreateAlignedLoad(
@@ -974,6 +997,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
// Residual code is required to move the remaining bytes. In the forward
// case, we emit it in the normal order.
IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
+ FwdResBuilder.SetCurrentDebugLocation(DbgLoc);
SmallVector<Type *, 5> RemainingOps;
TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
SrcAS, DstAS, PartSrcAlign,
@@ -996,7 +1020,9 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
BasicBlock *LoopBB
= BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+ const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
IRBuilder<> Builder(OrigBB->getTerminator());
+ Builder.SetCurrentDebugLocation(DbgLoc);
auto *ToLoopBR = Builder.CreateCondBr(
Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
@@ -1014,6 +1040,7 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Align PartAlign(commonAlignment(DstAlign, PartSize));
IRBuilder<> LoopBuilder(LoopBB);
+ LoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
@@ -1092,6 +1119,7 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
bool SrcIsVolatile = Memmove->isVolatile();
bool DstIsVolatile = SrcIsVolatile;
IRBuilder<> CastBuilder(Memmove);
+ CastBuilder.SetCurrentDebugLocation(Memmove->getStableDebugLoc());
unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/expand-mem-intrinsics.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/expand-mem-intrinsics.ll
new file mode 100644
index 0000000000000..0da7b1494ef9f
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/expand-mem-intrinsics.ll
@@ -0,0 +1,330 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -p=debugify,pre-isel-intrinsic-lowering,check-debugify -S < %s | FileCheck %s
+
+define protected amdgpu_kernel void @memcpy(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 noundef %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memcpy(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 noundef [[N:%.*]]) !dbg [[DBG5:![0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[N]], 15, !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[TMP1]], !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0, !dbg [[DBG11]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_COND:.*]], !dbg [[DBG11]]
+; CHECK: [[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]] ], !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]], !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP4]], align 1, !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[LOOP_INDEX]], !dbg [[DBG11]]
+; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(1) [[TMP6]], align 1, !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP7]] = add i64 [[LOOP_INDEX]], 16, !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP7]], [[TMP2]], !dbg [[DBG11]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_COND]], !dbg [[DBG11]]
+; CHECK: [[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_COND]]:
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP1]], 0, !dbg [[DBG11]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMCPY_POST_EXPANSION:.*]], !dbg [[DBG11]]
+; CHECK: [[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_BODY]]:
+; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_COND]] ], [ [[TMP14:%.*]], %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_BODY]] ], !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP2]], [[RESIDUAL_LOOP_INDEX]], !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]], !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1, !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]], !dbg [[DBG11]]
+; CHECK-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1, !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1, !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP1]], !dbg [[DBG11]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMCPY_POST_EXPANSION]], !dbg [[DBG11]]
+; CHECK: [[DYNAMIC_MEMCPY_POST_EXPANSION]]:
+; CHECK-NEXT: #dbg_value(i32 0, [[META9:![0-9]+]], !DIExpression(), [[META12:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META12]]
+;
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 %src, i64 %n, i1 false)
+ ret void
+}
+
+define protected amdgpu_kernel void @memcpy_1025_bytes(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memcpy_1025_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]]) !dbg [[DBG13:![0-9]+]] {
+; CHECK-NEXT: br label %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY:.*]], !dbg [[DBG16:![0-9]+]]
+; CHECK: [[STATIC_MEMCPY_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY]] ], !dbg [[DBG16]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]], !dbg [[DBG16]]
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !dbg [[DBG16]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[LOOP_INDEX]], !dbg [[DBG16]]
+; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG16]]
+; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256, !dbg [[DBG16]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024, !dbg [[DBG16]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_EXPANSION:.*]], !dbg [[DBG16]]
+; CHECK: [[STATIC_MEMCPY_POST_EXPANSION]]:
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024, !dbg [[DBG16]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1, !dbg [[DBG16]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024, !dbg [[DBG16]]
+; CHECK-NEXT: store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 1, !dbg [[DBG16]]
+; CHECK-NEXT: #dbg_value(i32 0, [[META15:![0-9]+]], !DIExpression(), [[META17:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META17]]
+;
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(1025) %dst, ptr addrspace(1) noundef align 1 dereferenceable(1025) %src, i64 1025, i1 false)
+ ret void
+}
+
+define protected amdgpu_kernel void @memcpy_2048_bytes(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memcpy_2048_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]]) !dbg [[DBG18:![0-9]+]] {
+; CHECK-NEXT: br label %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY:.*]], !dbg [[DBG21:![0-9]+]]
+; CHECK: [[STATIC_MEMCPY_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY]] ], !dbg [[DBG21]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]], !dbg [[DBG21]]
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !dbg [[DBG21]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[LOOP_INDEX]], !dbg [[DBG21]]
+; CHECK-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG21]]
+; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256, !dbg [[DBG21]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 2048, !dbg [[DBG21]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_EXPANSION:.*]], !dbg [[DBG21]]
+; CHECK: [[STATIC_MEMCPY_POST_EXPANSION]]:
+; CHECK-NEXT: #dbg_value(i32 0, [[META20:![0-9]+]], !DIExpression(), [[META22:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META22]]
+;
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(2048) %dst, ptr addrspace(1) noundef align 1 dereferenceable(2048) %src, i64 2048, i1 false)
+ ret void
+}
+
+define protected amdgpu_kernel void @memmove(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 noundef %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memmove(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 noundef [[N:%.*]]) !dbg [[DBG23:![0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[N]], 15, !dbg [[DBG26:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[TMP1]], !dbg [[DBG26]]
+; CHECK-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP1]], 0, !dbg [[DBG26]]
+; CHECK-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP2]], 0, !dbg [[DBG26]]
+; CHECK-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC]], [[DST]], !dbg [[DBG26]]
+; CHECK-NEXT: br i1 [[COMPARE_SRC_DST]], label %[[MEMMOVE_COPY_BACKWARDS:.*]], label %[[MEMMOVE_COPY_FORWARD:.*]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_COPY_BACKWARDS]]:
+; CHECK-NEXT: br i1 [[SKIP_RESIDUAL]], label %[[MEMMOVE_BWD_MIDDLE:.*]], label %[[MEMMOVE_BWD_RESIDUAL_LOOP:.*]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_BWD_RESIDUAL_LOOP]]:
+; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], %[[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[N]], %[[MEMMOVE_COPY_BACKWARDS]] ], !dbg [[DBG26]]
+; CHECK-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP3]], 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP4]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_RESIDUAL_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP5]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP2]], !dbg [[DBG26]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MEMMOVE_BWD_MIDDLE]], label %[[MEMMOVE_BWD_RESIDUAL_LOOP]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_BWD_MIDDLE]]:
+; CHECK-NEXT: br i1 [[SKIP_MAIN]], label %[[MEMMOVE_DONE:.*]], label %[[MEMMOVE_BWD_MAIN_LOOP:.*]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_BWD_MAIN_LOOP]]:
+; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], %[[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP2]], %[[MEMMOVE_BWD_MIDDLE]] ], !dbg [[DBG26]]
+; CHECK-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP7]], 16, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0, !dbg [[DBG26]]
+; CHECK-NEXT: br i1 [[TMP10]], label %[[MEMMOVE_DONE]], label %[[MEMMOVE_BWD_MAIN_LOOP]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_COPY_FORWARD]]:
+; CHECK-NEXT: br i1 [[SKIP_MAIN]], label %[[MEMMOVE_FWD_MIDDLE:.*]], label %[[MEMMOVE_FWD_MAIN_LOOP:.*]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_FWD_MAIN_LOOP]]:
+; CHECK-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP13:%.*]], %[[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, %[[MEMMOVE_COPY_FORWARD]] ], !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP11]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(1) [[TMP12]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP13]] = add i64 [[FWD_MAIN_INDEX]], 16, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[TMP2]], !dbg [[DBG26]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MEMMOVE_FWD_MIDDLE]], label %[[MEMMOVE_FWD_MAIN_LOOP]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_FWD_MIDDLE]]:
+; CHECK-NEXT: br i1 [[SKIP_RESIDUAL]], label %[[MEMMOVE_DONE]], label %[[MEMMOVE_FWD_RESIDUAL_LOOP:.*]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_FWD_RESIDUAL_LOOP]]:
+; CHECK-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP17:%.*]], %[[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP2]], %[[MEMMOVE_FWD_MIDDLE]] ], !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(1) [[TMP15]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_RESIDUAL_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT: store i8 [[ELEMENT3]], ptr addrspace(1) [[TMP16]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP17]] = add i64 [[FWD_RESIDUAL_INDEX]], 1, !dbg [[DBG26]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], [[N]], !dbg [[DBG26]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MEMMOVE_DONE]], label %[[MEMMOVE_FWD_RESIDUAL_LOOP]], !dbg [[DBG26]]
+; CHECK: [[MEMMOVE_DONE]]:
+; CHECK-NEXT: #dbg_value(i32 0, [[META25:![0-9]+]], !DIExpression(), [[META27:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META27]]
+;
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 %src, i64 %n, i1 false)
+ ret void
+}
+
+define protected amdgpu_kernel void @memmove_1025_bytes(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memmove_1025_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]]) !dbg [[DBG28:![0-9]+]] {
+; CHECK-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC]], [[DST]], !dbg [[DBG31:![0-9]+]]
+; CHECK-NEXT: br i1 [[COMPARE_SRC_DST]], label %[[MEMMOVE_BWD_RESIDUAL:.*]], label %[[MEMMOVE_FWD_LOOP:.*]], !dbg [[DBG31]]
+; CHECK: [[MEMMOVE_BWD_RESIDUAL]]:
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024, !dbg [[DBG31]]
+; CHECK-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT: br label %[[MEMMOVE_BWD_LOOP:.*]], !dbg [[DBG31]]
+; CHECK: [[MEMMOVE_BWD_LOOP]]:
+; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], %[[MEMMOVE_BWD_LOOP]] ], [ 1024, %[[MEMMOVE_BWD_RESIDUAL]] ], !dbg [[DBG31]]
+; CHECK-NEXT: [[BWD_INDEX]] = sub i64 [[TMP4]], 256, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]], !dbg [[DBG31]]
+; CHECK-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]], !dbg [[DBG31]]
+; CHECK-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0, !dbg [[DBG31]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MEMMOVE_DONE:.*]], label %[[MEMMOVE_BWD_LOOP]], !dbg [[DBG31]]
+; CHECK: [[MEMMOVE_FWD_LOOP]]:
+; CHECK-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], %[[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ], !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]], !dbg [[DBG31]]
+; CHECK-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP8]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]], !dbg [[DBG31]]
+; CHECK-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP10]] = add i64 [[FWD_INDEX]], 256, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 1024, !dbg [[DBG31]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MEMMOVE_FWD_RESIDUAL:.*]], label %[[MEMMOVE_FWD_LOOP]], !dbg [[DBG31]]
+; CHECK: [[MEMMOVE_FWD_RESIDUAL]]:
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024, !dbg [[DBG31]]
+; CHECK-NEXT: store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT: br label %[[MEMMOVE_DONE]], !dbg [[DBG31]]
+; CHECK: [[MEMMOVE_DONE]]:
+; CHECK-NEXT: #dbg_value(i32 0, [[META30:![0-9]+]], !DIExpression(), [[META32:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META32]]
+;
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(1025) %dst, ptr addrspace(1) noundef align 1 dereferenceable(1025) %src, i64 1025, i1 false)
+ ret void
+}
+
+define protected amdgpu_kernel void @memmove_2048_bytes(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memmove_2048_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]]) !dbg [[DBG33:![0-9]+]] {
+; CHECK-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC]], [[DST]], !dbg [[DBG36:![0-9]+]]
+; CHECK-NEXT: br i1 [[COMPARE_SRC_DST]], label %[[MEMMOVE_BWD_LOOP:.*]], label %[[MEMMOVE_FWD_LOOP:.*]], !dbg [[DBG36]]
+; CHECK: [[MEMMOVE_BWD_LOOP]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], %[[MEMMOVE_BWD_LOOP]] ], [ 2048, [[TMP0:%.*]] ], !dbg [[DBG36]]
+; CHECK-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 256, !dbg [[DBG36]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]], !dbg [[DBG36]]
+; CHECK-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1, !dbg [[DBG36]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]], !dbg [[DBG36]]
+; CHECK-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG36]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0, !dbg [[DBG36]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[MEMMOVE_DONE:.*]], label %[[MEMMOVE_BWD_LOOP]], !dbg [[DBG36]]
+; CHECK: [[MEMMOVE_FWD_LOOP]]:
+; CHECK-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], %[[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ], !dbg [[DBG36]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]], !dbg [[DBG36]]
+; CHECK-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1, !dbg [[DBG36]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]], !dbg [[DBG36]]
+; CHECK-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1, !dbg [[DBG36]]
+; CHECK-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 256, !dbg [[DBG36]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 2048, !dbg [[DBG36]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MEMMOVE_DONE]], label %[[MEMMOVE_FWD_LOOP]], !dbg [[DBG36]]
+; CHECK: [[MEMMOVE_DONE]]:
+; CHECK-NEXT: #dbg_value(i32 0, [[META35:![0-9]+]], !DIExpression(), [[META37:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META37]]
+;
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(2048) %dst, ptr addrspace(1) noundef align 1 dereferenceable(2048) %src, i64 2048, i1 false)
+ ret void
+}
+
+define protected amdgpu_kernel void @memset(ptr addrspace(1) %dst, i8 %value, i64 noundef %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memset(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i8 [[VALUE:%.*]], i64 noundef [[N:%.*]]) !dbg [[DBG38:![0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[N]], !dbg [[DBG41:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]], !dbg [[DBG41]]
+; CHECK: [[LOADSTORELOOP]]:
+; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ], !dbg [[DBG41]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP2]], !dbg [[DBG41]]
+; CHECK-NEXT: store i8 [[VALUE]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG41]]
+; CHECK-NEXT: [[TMP4]] = add i64 [[TMP2]], 1, !dbg [[DBG41]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[N]], !dbg [[DBG41]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]], !dbg [[DBG41]]
+; CHECK: [[SPLIT]]:
+; CHECK-NEXT: #dbg_value(i32 0, [[META40:![0-9]+]], !DIExpression(), [[META42:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META42]]
+;
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) align 1 %dst, i8 %value, i64 %n, i1 false)
+ ret void
+}
+
+define protected amdgpu_kernel void @memset_1025_bytes(ptr addrspace(1) %dst, i8 %value) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memset_1025_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i8 [[VALUE:%.*]]) !dbg [[DBG43:![0-9]+]] {
+; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]], !dbg [[DBG46:![0-9]+]]
+; CHECK: [[LOADSTORELOOP]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ], !dbg [[DBG46]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP1]], !dbg [[DBG46]]
+; CHECK-NEXT: store i8 [[VALUE]], ptr addrspace(1) [[TMP2]], align 1, !dbg [[DBG46]]
+; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1, !dbg [[DBG46]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025, !dbg [[DBG46]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]], !dbg [[DBG46]]
+; CHECK: [[SPLIT]]:
+; CHECK-NEXT: #dbg_value(i32 0, [[META45:![0-9]+]], !DIExpression(), [[META47:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META47]]
+;
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(1025) %dst, i8 %value, i64 1025, i1 false)
+ ret void
+}
+
+define protected amdgpu_kernel void @memset_2048_bytes(ptr addrspace(1) %dst, i8 %value) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memset_2048_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i8 [[VALUE:%.*]]) !dbg [[DBG48:![0-9]+]] {
+; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]], !dbg [[DBG51:![0-9]+]]
+; CHECK: [[LOADSTORELOOP]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ], !dbg [[DBG51]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP1]], !dbg [[DBG51]]
+; CHECK-NEXT: store i8 [[VALUE]], ptr addrspace(1) [[TMP2]], align 1, !dbg [[DBG51]]
+; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1, !dbg [[DBG51]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 2048, !dbg [[DBG51]]
+; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]], !dbg [[DBG51]]
+; CHECK: [[SPLIT]]:
+; CHECK-NEXT: #dbg_value(i32 0, [[META50:![0-9]+]], !DIExpression(), [[META52:![0-9]+]])
+; CHECK-NEXT: ret void, !dbg [[META52]]
+;
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(2048) %dst, i8 %value, i64 2048, i1 false)
+ ret void
+}
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+; CHECK: [[META1]] = !DIFile(filename: "{{.*}}<stdin>", directory: {{.*}})
+; CHECK: [[DBG5]] = distinct !DISubprogram(name: "memcpy", linkageName: "memcpy", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META8:![0-9]+]])
+; CHECK: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]])
+; CHECK: [[META7]] = !{}
+; CHECK: [[META8]] = !{[[META9]]}
+; CHECK: [[META9]] = !DILocalVariable(name: "1", scope: [[DBG5]], file: [[META1]], line: 2, type: [[META10:![0-9]+]])
+; CHECK: [[META10]] = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+; CHECK: [[DBG11]] = !DILocation(line: 1, column: 1, scope: [[DBG5]])
+; CHECK: [[META12]] = !DILocation(line: 2, column: 1, scope: [[DBG5]])
+; CHECK: [[DBG13]] = distinct !DISubprogram(name: "memcpy_1025_bytes", linkageName: "memcpy_1025_bytes", scope: null, file: [[META1]], line: 3, type: [[META6]], scopeLine: 3, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META14:![0-9]+]])
+; CHECK: [[META14]] = !{[[META15]]}
+; CHECK: [[META15]] = !DILocalVariable(name: "2", scope: [[DBG13]], file: [[META1]], line: 4, type: [[META10]])
+; CHECK: [[DBG16]] = !DILocation(line: 3, column: 1, scope: [[DBG13]])
+; CHECK: [[META17]] = !DILocation(line: 4, column: 1, scope: [[DBG13]])
+; CHECK: [[DBG18]] = distinct !DISubprogram(name: "memcpy_2048_bytes", linkageName: "memcpy_2048_bytes", scope: null, file: [[META1]], line: 5, type: [[META6]], scopeLine: 5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META19:![0-9]+]])
+; CHECK: [[META19]] = !{[[META20]]}
+; CHECK: [[META20]] = !DILocalVariable(name: "3", scope: [[DBG18]], file: [[META1]], line: 6, type: [[META10]])
+; CHECK: [[DBG21]] = !DILocation(line: 5, column: 1, scope: [[DBG18]])
+; CHECK: [[META22]] = !DILocation(line: 6, column: 1, scope: [[DBG18]])
+; CHECK: [[DBG23]] = distinct !DISubprogram(name: "memmove", linkageName: "memmove", scope: null, file: [[META1]], line: 7, type: [[META6]], scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META24:![0-9]+]])
+; CHECK: [[META24]] = !{[[META25]]}
+; CHECK: [[META25]] = !DILocalVariable(name: "4", scope: [[DBG23]], file: [[META1]], line: 8, type: [[META10]])
+; CHECK: [[DBG26]] = !DILocation(line: 7, column: 1, scope: [[DBG23]])
+; CHECK: [[META27]] = !DILocation(line: 8, column: 1, scope: [[DBG23]])
+; CHECK: [[DBG28]] = distinct !DISubprogram(name: "memmove_1025_bytes", linkageName: "memmove_1025_bytes", scope: null, file: [[META1]], line: 9, type: [[META6]], scopeLine: 9, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META29:![0-9]+]])
+; CHECK: [[META29]] = !{[[META30]]}
+; CHECK: [[META30]] = !DILocalVariable(name: "5", scope: [[DBG28]], file: [[META1]], line: 10, type: [[META10]])
+; CHECK: [[DBG31]] = !DILocation(line: 9, column: 1, scope: [[DBG28]])
+; CHECK: [[META32]] = !DILocation(line: 10, column: 1, scope: [[DBG28]])
+; CHECK: [[DBG33]] = distinct !DISubprogram(name: "memmove_2048_bytes", linkageName: "memmove_2048_bytes", scope: null, file: [[META1]], line: 11, type: [[META6]], scopeLine: 11, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META34:![0-9]+]])
+; CHECK: [[META34]] = !{[[META35]]}
+; CHECK: [[META35]] = !DILocalVariable(name: "6", scope: [[DBG33]], file: [[META1]], line: 12, type: [[META10]])
+; CHECK: [[DBG36]] = !DILocation(line: 11, column: 1, scope: [[DBG33]])
+; CHECK: [[META37]] = !DILocation(line: 12, column: 1, scope: [[DBG33]])
+; CHECK: [[DBG38]] = distinct !DISubprogram(name: "memset", linkageName: "memset", scope: null, file: [[META1]], line: 13, type: [[META6]], scopeLine: 13, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META39:![0-9]+]])
+; CHECK: [[META39]] = !{[[META40]]}
+; CHECK: [[META40]] = !DILocalVariable(name: "7", scope: [[DBG38]], file: [[META1]], line: 14, type: [[META10]])
+; CHECK: [[DBG41]] = !DILocation(line: 13, column: 1, scope: [[DBG38]])
+; CHECK: [[META42]] = !DILocation(line: 14, column: 1, scope: [[DBG38]])
+; CHECK: [[DBG43]] = distinct !DISubprogram(name: "memset_1025_bytes", linkageName: "memset_1025_bytes", scope: null, file: [[META1]], line: 15, type: [[META6]], scopeLine: 15, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META44:![0-9]+]])
+; CHECK: [[META44]] = !{[[META45]]}
+; CHECK: [[META45]] = !DILocalVariable(name: "8", scope: [[DBG43]], file: [[META1]], line: 16, type: [[META10]])
+; CHECK: [[DBG46]] = !DILocation(line: 15, column: 1, scope: [[DBG43]])
+; CHECK: [[META47]] = !DILocation(line: 16, column: 1, scope: [[DBG43]])
+; CHECK: [[DBG48]] = distinct !DISubprogram(name: "memset_2048_bytes", linkageName: "memset_2048_bytes", scope: null, file: [[META1]], line: 17, type: [[META6]], scopeLine: 17, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META49:![0-9]+]])
+; CHECK: [[META49]] = !{[[META50]]}
+; CHECK: [[META50]] = !DILocalVariable(name: "9", scope: [[DBG48]], file: [[META1]], line: 18, type: [[META10]])
+; CHECK: [[DBG51]] = !DILocation(line: 17, column: 1, scope: [[DBG48]])
+; CHECK: [[META52]] = !DILocation(line: 18, column: 1, scope: [[DBG48]])
+;.
More information about the llvm-commits
mailing list