[llvm] e3284b9 - [LowerMemIntrinsics][AMDGPU] Propagate Debug Value (#178131)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 27 01:16:19 PST 2026


Author: Abhishek Kaushik
Date: 2026-01-27T14:46:14+05:30
New Revision: e3284b96815d7ef9ac358724b49a25befc2c4439

URL: https://github.com/llvm/llvm-project/commit/e3284b96815d7ef9ac358724b49a25befc2c4439
DIFF: https://github.com/llvm/llvm-project/commit/e3284b96815d7ef9ac358724b49a25befc2c4439.diff

LOG: [LowerMemIntrinsics][AMDGPU] Propagate Debug Value (#178131)

Propagate debug locations (`DebugLoc`) to the instructions emitted when
`memcpy`, `memmove` and `memset` intrinsics are expanded into loops.

Added: 
    llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/expand-mem-intrinsics.ll

Modified: 
    llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index e294a3e29c28d..e77cdbe7a3876 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -140,7 +140,9 @@ insertLoopExpansion(Instruction *InsertBefore, Value *Len,
       InsertBefore, BBNamePrefix + "-post-expansion");
   Function *ParentFunc = PreLoopBB->getParent();
   LLVMContext &Ctx = PreLoopBB->getContext();
+  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
   IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
+  PreLoopBuilder.SetCurrentDebugLocation(DbgLoc);
 
   // Calculate the main loop trip count and remaining units to cover after the
   // loop.
@@ -179,6 +181,7 @@ insertLoopExpansion(Instruction *InsertBefore, Value *Len,
   BasicBlock *MainLoopBB = BasicBlock::Create(
       Ctx, BBNamePrefix + "-expansion-main-body", ParentFunc, PostLoopBB);
   IRBuilder<> LoopBuilder(MainLoopBB);
+  LoopBuilder.SetCurrentDebugLocation(DbgLoc);
 
   PHINode *LoopIndex = LoopBuilder.CreatePHI(LenType, 2, "loop-index");
   LEI.MainLoopIndex = LoopIndex;
@@ -228,10 +231,12 @@ insertLoopExpansion(Instruction *InsertBefore, Value *Len,
 
     // Determine if we need to branch to the residual loop or bypass it.
     IRBuilder<> RCBuilder(ResidualCondBB);
+    RCBuilder.SetCurrentDebugLocation(DbgLoc);
     RCBuilder.CreateCondBr(RCBuilder.CreateICmpNE(ResidualUnits, Zero),
                            ResLoopBB, PostLoopBB);
 
     IRBuilder<> ResBuilder(ResLoopBB);
+    ResBuilder.SetCurrentDebugLocation(DbgLoc);
     PHINode *ResidualIndex =
         ResBuilder.CreatePHI(LenType, 2, "residual-loop-index");
     ResidualIndex->addIncoming(Zero, ResidualCondBB);
@@ -589,7 +594,9 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
       ConstantInt::get(ILengthType, ResidualLoopOpSize);
   ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
 
+  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
   IRBuilder<> PLBuilder(InsertBefore);
+  PLBuilder.SetCurrentDebugLocation(DbgLoc);
 
   Value *RuntimeLoopBytes = CopyLen;
   Value *RuntimeLoopRemainder = nullptr;
@@ -675,6 +682,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
       BasicBlock *ResidualLoopBB = BasicBlock::Create(
           F->getContext(), "memmove_bwd_residual_loop", F, MainLoopBB);
       IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+      ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
       PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
       Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
           ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
@@ -697,6 +705,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
           F->getContext(), "memmove_bwd_middle", F, MainLoopBB);
       // Later code expects a terminator in the PredBB.
       IRBuilder<> IntermediateBuilder(IntermediateBB);
+      IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
       IntermediateBuilder.CreateUnreachable();
       ResidualLoopBuilder.CreateCondBr(
           ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
@@ -706,8 +715,10 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
       ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
 
       // How to get to the residual:
-      BranchInst::Create(IntermediateBB, ResidualLoopBB, SkipResidualCondition,
-                         ThenTerm->getIterator());
+      BranchInst *BrInst =
+          BranchInst::Create(IntermediateBB, ResidualLoopBB,
+                             SkipResidualCondition, ThenTerm->getIterator());
+      BrInst->setDebugLoc(DbgLoc);
       ThenTerm->eraseFromParent();
 
       PredBB = IntermediateBB;
@@ -715,6 +726,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
 
     // main loop
     IRBuilder<> MainLoopBuilder(MainLoopBB);
+    MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
     PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
     Value *MainIndex =
         MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
@@ -733,8 +745,9 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
 
     // How to get to the main loop:
     Instruction *PredBBTerm = PredBB->getTerminator();
-    BranchInst::Create(ExitBB, MainLoopBB, SkipMainCondition,
-                       PredBBTerm->getIterator());
+    BranchInst *BrInst = BranchInst::Create(
+        ExitBB, MainLoopBB, SkipMainCondition, PredBBTerm->getIterator());
+    BrInst->setDebugLoc(DbgLoc);
     PredBBTerm->eraseFromParent();
   }
 
@@ -744,6 +757,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
     BasicBlock *MainLoopBB =
         BasicBlock::Create(F->getContext(), "memmove_fwd_main_loop", F, ExitBB);
     IRBuilder<> MainLoopBuilder(MainLoopBB);
+    MainLoopBuilder.SetCurrentDebugLocation(DbgLoc);
     PHINode *MainLoopPhi =
         MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
     Value *LoadGEP =
@@ -770,13 +784,16 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
         MainLoopBB);
 
     // getting in or skipping the main loop
-    BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
-                       CopyFwdBBTerm->getIterator());
+    BranchInst *BrInst =
+        BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
+                           CopyFwdBBTerm->getIterator());
+    BrInst->setDebugLoc(DbgLoc);
     CopyFwdBBTerm->eraseFromParent();
 
     if (RequiresResidual) {
       BasicBlock *IntermediateBB = SuccessorBB;
       IRBuilder<> IntermediateBuilder(IntermediateBB);
+      IntermediateBuilder.SetCurrentDebugLocation(DbgLoc);
       BasicBlock *ResidualLoopBB = BasicBlock::Create(
           F->getContext(), "memmove_fwd_residual_loop", F, ExitBB);
       IntermediateBuilder.CreateCondBr(SkipResidualCondition, ExitBB,
@@ -784,6 +801,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
 
       // Residual loop
       IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
+      ResidualLoopBuilder.SetCurrentDebugLocation(DbgLoc);
       PHINode *ResidualLoopPhi =
           ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
       Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
@@ -841,7 +859,9 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
   ConstantInt *LoopBound = ConstantInt::get(ILengthType, BytesCopiedInLoop);
   ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
 
+  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
   IRBuilder<> PLBuilder(InsertBefore);
+  PLBuilder.SetCurrentDebugLocation(DbgLoc);
 
   auto [CmpSrcAddr, CmpDstAddr] =
       tryInsertCastToCommonAddrSpace(PLBuilder, SrcAddr, DstAddr, TTI);
@@ -894,6 +914,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
     // instead of after it.
     IRBuilder<> BwdResBuilder(CopyBackwardsBB,
                               CopyBackwardsBB->getFirstNonPHIIt());
+    BwdResBuilder.SetCurrentDebugLocation(DbgLoc);
     SmallVector<Type *, 5> RemainingOps;
     TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                           SrcAS, DstAS, PartSrcAlign,
@@ -917,6 +938,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
       CopyBackwardsBB->setName("memmove_bwd_loop");
     }
     IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+    LoopBuilder.SetCurrentDebugLocation(DbgLoc);
     PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
     Value *Index = LoopBuilder.CreateSub(LoopPhi, CILoopOpSize, "bwd_index");
     Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, Index);
@@ -950,6 +972,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
       FwdResidualBB = SuccBB;
     }
     IRBuilder<> LoopBuilder(LoopBB->getTerminator());
+    LoopBuilder.SetCurrentDebugLocation(DbgLoc);
     PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
     Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopPhi);
     Value *Element = LoopBuilder.CreateAlignedLoad(
@@ -974,6 +997,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
     // Residual code is required to move the remaining bytes. In the forward
     // case, we emit it in the normal order.
     IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
+    FwdResBuilder.SetCurrentDebugLocation(DbgLoc);
     SmallVector<Type *, 5> RemainingOps;
     TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                           SrcAS, DstAS, PartSrcAlign,
@@ -996,7 +1020,9 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
   BasicBlock *LoopBB
     = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
 
+  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
   IRBuilder<> Builder(OrigBB->getTerminator());
+  Builder.SetCurrentDebugLocation(DbgLoc);
 
   auto *ToLoopBR = Builder.CreateCondBr(
       Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
@@ -1014,6 +1040,7 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
   Align PartAlign(commonAlignment(DstAlign, PartSize));
 
   IRBuilder<> LoopBuilder(LoopBB);
+  LoopBuilder.SetCurrentDebugLocation(DbgLoc);
   PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
   LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
 
@@ -1092,6 +1119,7 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
   bool SrcIsVolatile = Memmove->isVolatile();
   bool DstIsVolatile = SrcIsVolatile;
   IRBuilder<> CastBuilder(Memmove);
+  CastBuilder.SetCurrentDebugLocation(Memmove->getStableDebugLoc());
 
   unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
   unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();

diff  --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/expand-mem-intrinsics.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/expand-mem-intrinsics.ll
new file mode 100644
index 0000000000000..0da7b1494ef9f
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/expand-mem-intrinsics.ll
@@ -0,0 +1,330 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -p=debugify,pre-isel-intrinsic-lowering,check-debugify -S < %s | FileCheck %s
+
+define protected amdgpu_kernel void @memcpy(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 noundef %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memcpy(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 noundef [[N:%.*]]) !dbg [[DBG5:![0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[N]], 15, !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[N]], [[TMP1]], !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0, !dbg [[DBG11]]
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_COND:.*]], !dbg [[DBG11]]
+; CHECK:       [[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]] ], !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]], !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP4]], align 1, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[LOOP_INDEX]], !dbg [[DBG11]]
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[TMP6]], align 1, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP7]] = add i64 [[LOOP_INDEX]], 16, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP7]], [[TMP2]], !dbg [[DBG11]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[DYNAMIC_MEMCPY_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_COND]], !dbg [[DBG11]]
+; CHECK:       [[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_COND]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i64 [[TMP1]], 0, !dbg [[DBG11]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMCPY_POST_EXPANSION:.*]], !dbg [[DBG11]]
+; CHECK:       [[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_BODY]]:
+; CHECK-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_COND]] ], [ [[TMP14:%.*]], %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_BODY]] ], !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[TMP2]], [[RESIDUAL_LOOP_INDEX]], !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]], !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]], !dbg [[DBG11]]
+; CHECK-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1, !dbg [[DBG11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP1]], !dbg [[DBG11]]
+; CHECK-NEXT:    br i1 [[TMP15]], label %[[DYNAMIC_MEMCPY_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMCPY_POST_EXPANSION]], !dbg [[DBG11]]
+; CHECK:       [[DYNAMIC_MEMCPY_POST_EXPANSION]]:
+; CHECK-NEXT:      #dbg_value(i32 0, [[META9:![0-9]+]], !DIExpression(), [[META12:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META12]]
+;
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 %src, i64 %n, i1 false)
+  ret void
+}
+
+define protected amdgpu_kernel void @memcpy_1025_bytes(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memcpy_1025_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]]) !dbg [[DBG13:![0-9]+]] {
+; CHECK-NEXT:    br label %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY:.*]], !dbg [[DBG16:![0-9]+]]
+; CHECK:       [[STATIC_MEMCPY_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY]] ], !dbg [[DBG16]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]], !dbg [[DBG16]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !dbg [[DBG16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[LOOP_INDEX]], !dbg [[DBG16]]
+; CHECK-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG16]]
+; CHECK-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256, !dbg [[DBG16]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024, !dbg [[DBG16]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_EXPANSION:.*]], !dbg [[DBG16]]
+; CHECK:       [[STATIC_MEMCPY_POST_EXPANSION]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024, !dbg [[DBG16]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1, !dbg [[DBG16]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024, !dbg [[DBG16]]
+; CHECK-NEXT:    store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 1, !dbg [[DBG16]]
+; CHECK-NEXT:      #dbg_value(i32 0, [[META15:![0-9]+]], !DIExpression(), [[META17:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META17]]
+;
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(1025) %dst, ptr addrspace(1) noundef align 1 dereferenceable(1025) %src, i64 1025, i1 false)
+  ret void
+}
+
+define protected amdgpu_kernel void @memcpy_2048_bytes(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memcpy_2048_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]]) !dbg [[DBG18:![0-9]+]] {
+; CHECK-NEXT:    br label %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY:.*]], !dbg [[DBG21:![0-9]+]]
+; CHECK:       [[STATIC_MEMCPY_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY]] ], !dbg [[DBG21]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]], !dbg [[DBG21]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !dbg [[DBG21]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[LOOP_INDEX]], !dbg [[DBG21]]
+; CHECK-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG21]]
+; CHECK-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256, !dbg [[DBG21]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 2048, !dbg [[DBG21]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[STATIC_MEMCPY_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMCPY_POST_EXPANSION:.*]], !dbg [[DBG21]]
+; CHECK:       [[STATIC_MEMCPY_POST_EXPANSION]]:
+; CHECK-NEXT:      #dbg_value(i32 0, [[META20:![0-9]+]], !DIExpression(), [[META22:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META22]]
+;
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(2048) %dst, ptr addrspace(1) noundef align 1 dereferenceable(2048) %src, i64 2048, i1 false)
+  ret void
+}
+
+define protected amdgpu_kernel void @memmove(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 noundef %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memmove(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 noundef [[N:%.*]]) !dbg [[DBG23:![0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[N]], 15, !dbg [[DBG26:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[N]], [[TMP1]], !dbg [[DBG26]]
+; CHECK-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP1]], 0, !dbg [[DBG26]]
+; CHECK-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP2]], 0, !dbg [[DBG26]]
+; CHECK-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC]], [[DST]], !dbg [[DBG26]]
+; CHECK-NEXT:    br i1 [[COMPARE_SRC_DST]], label %[[MEMMOVE_COPY_BACKWARDS:.*]], label %[[MEMMOVE_COPY_FORWARD:.*]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_COPY_BACKWARDS]]:
+; CHECK-NEXT:    br i1 [[SKIP_RESIDUAL]], label %[[MEMMOVE_BWD_MIDDLE:.*]], label %[[MEMMOVE_BWD_RESIDUAL_LOOP:.*]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_BWD_RESIDUAL_LOOP]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], %[[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[N]], %[[MEMMOVE_COPY_BACKWARDS]] ], !dbg [[DBG26]]
+; CHECK-NEXT:    [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP3]], 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT:    [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP4]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_RESIDUAL_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT:    store i8 [[ELEMENT]], ptr addrspace(1) [[TMP5]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP2]], !dbg [[DBG26]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MEMMOVE_BWD_MIDDLE]], label %[[MEMMOVE_BWD_RESIDUAL_LOOP]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_BWD_MIDDLE]]:
+; CHECK-NEXT:    br i1 [[SKIP_MAIN]], label %[[MEMMOVE_DONE:.*]], label %[[MEMMOVE_BWD_MAIN_LOOP:.*]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_BWD_MAIN_LOOP]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], %[[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP2]], %[[MEMMOVE_BWD_MIDDLE]] ], !dbg [[DBG26]]
+; CHECK-NEXT:    [[BWD_MAIN_INDEX]] = sub i64 [[TMP7]], 16, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0, !dbg [[DBG26]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MEMMOVE_DONE]], label %[[MEMMOVE_BWD_MAIN_LOOP]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_COPY_FORWARD]]:
+; CHECK-NEXT:    br i1 [[SKIP_MAIN]], label %[[MEMMOVE_FWD_MIDDLE:.*]], label %[[MEMMOVE_FWD_MAIN_LOOP:.*]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_FWD_MAIN_LOOP]]:
+; CHECK-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP13:%.*]], %[[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, %[[MEMMOVE_COPY_FORWARD]] ], !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT:    [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP11]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT:    store <4 x i32> [[ELEMENT2]], ptr addrspace(1) [[TMP12]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP13]] = add i64 [[FWD_MAIN_INDEX]], 16, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[TMP2]], !dbg [[DBG26]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[MEMMOVE_FWD_MIDDLE]], label %[[MEMMOVE_FWD_MAIN_LOOP]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_FWD_MIDDLE]]:
+; CHECK-NEXT:    br i1 [[SKIP_RESIDUAL]], label %[[MEMMOVE_DONE]], label %[[MEMMOVE_FWD_RESIDUAL_LOOP:.*]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_FWD_RESIDUAL_LOOP]]:
+; CHECK-NEXT:    [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP17:%.*]], %[[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP2]], %[[MEMMOVE_FWD_MIDDLE]] ], !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT:    [[ELEMENT3:%.*]] = load i8, ptr addrspace(1) [[TMP15]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_RESIDUAL_INDEX]], !dbg [[DBG26]]
+; CHECK-NEXT:    store i8 [[ELEMENT3]], ptr addrspace(1) [[TMP16]], align 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP17]] = add i64 [[FWD_RESIDUAL_INDEX]], 1, !dbg [[DBG26]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP17]], [[N]], !dbg [[DBG26]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[MEMMOVE_DONE]], label %[[MEMMOVE_FWD_RESIDUAL_LOOP]], !dbg [[DBG26]]
+; CHECK:       [[MEMMOVE_DONE]]:
+; CHECK-NEXT:      #dbg_value(i32 0, [[META25:![0-9]+]], !DIExpression(), [[META27:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META27]]
+;
+  tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 %src, i64 %n, i1 false)
+  ret void
+}
+
+define protected amdgpu_kernel void @memmove_1025_bytes(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memmove_1025_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]]) !dbg [[DBG28:![0-9]+]] {
+; CHECK-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC]], [[DST]], !dbg [[DBG31:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COMPARE_SRC_DST]], label %[[MEMMOVE_BWD_RESIDUAL:.*]], label %[[MEMMOVE_FWD_LOOP:.*]], !dbg [[DBG31]]
+; CHECK:       [[MEMMOVE_BWD_RESIDUAL]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024, !dbg [[DBG31]]
+; CHECK-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT:    br label %[[MEMMOVE_BWD_LOOP:.*]], !dbg [[DBG31]]
+; CHECK:       [[MEMMOVE_BWD_LOOP]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], %[[MEMMOVE_BWD_LOOP]] ], [ 1024, %[[MEMMOVE_BWD_RESIDUAL]] ], !dbg [[DBG31]]
+; CHECK-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 256, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]], !dbg [[DBG31]]
+; CHECK-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]], !dbg [[DBG31]]
+; CHECK-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0, !dbg [[DBG31]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MEMMOVE_DONE:.*]], label %[[MEMMOVE_BWD_LOOP]], !dbg [[DBG31]]
+; CHECK:       [[MEMMOVE_FWD_LOOP]]:
+; CHECK-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], %[[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ], !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]], !dbg [[DBG31]]
+; CHECK-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP8]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]], !dbg [[DBG31]]
+; CHECK-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 256, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 1024, !dbg [[DBG31]]
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MEMMOVE_FWD_RESIDUAL:.*]], label %[[MEMMOVE_FWD_LOOP]], !dbg [[DBG31]]
+; CHECK:       [[MEMMOVE_FWD_RESIDUAL]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024, !dbg [[DBG31]]
+; CHECK-NEXT:    store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 1, !dbg [[DBG31]]
+; CHECK-NEXT:    br label %[[MEMMOVE_DONE]], !dbg [[DBG31]]
+; CHECK:       [[MEMMOVE_DONE]]:
+; CHECK-NEXT:      #dbg_value(i32 0, [[META30:![0-9]+]], !DIExpression(), [[META32:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META32]]
+;
+  tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(1025) %dst, ptr addrspace(1) noundef align 1 dereferenceable(1025) %src, i64 1025, i1 false)
+  ret void
+}
+
+define protected amdgpu_kernel void @memmove_2048_bytes(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memmove_2048_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]]) !dbg [[DBG33:![0-9]+]] {
+; CHECK-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC]], [[DST]], !dbg [[DBG36:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COMPARE_SRC_DST]], label %[[MEMMOVE_BWD_LOOP:.*]], label %[[MEMMOVE_FWD_LOOP:.*]], !dbg [[DBG36]]
+; CHECK:       [[MEMMOVE_BWD_LOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], %[[MEMMOVE_BWD_LOOP]] ], [ 2048, [[TMP0:%.*]] ], !dbg [[DBG36]]
+; CHECK-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 256, !dbg [[DBG36]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]], !dbg [[DBG36]]
+; CHECK-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1, !dbg [[DBG36]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]], !dbg [[DBG36]]
+; CHECK-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG36]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0, !dbg [[DBG36]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MEMMOVE_DONE:.*]], label %[[MEMMOVE_BWD_LOOP]], !dbg [[DBG36]]
+; CHECK:       [[MEMMOVE_FWD_LOOP]]:
+; CHECK-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], %[[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ], !dbg [[DBG36]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]], !dbg [[DBG36]]
+; CHECK-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1, !dbg [[DBG36]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]], !dbg [[DBG36]]
+; CHECK-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1, !dbg [[DBG36]]
+; CHECK-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 256, !dbg [[DBG36]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 2048, !dbg [[DBG36]]
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MEMMOVE_DONE]], label %[[MEMMOVE_FWD_LOOP]], !dbg [[DBG36]]
+; CHECK:       [[MEMMOVE_DONE]]:
+; CHECK-NEXT:      #dbg_value(i32 0, [[META35:![0-9]+]], !DIExpression(), [[META37:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META37]]
+;
+  tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(2048) %dst, ptr addrspace(1) noundef align 1 dereferenceable(2048) %src, i64 2048, i1 false)
+  ret void
+}
+
+define protected amdgpu_kernel void @memset(ptr addrspace(1) %dst, i8 %value, i64 noundef %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memset(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i8 [[VALUE:%.*]], i64 noundef [[N:%.*]]) !dbg [[DBG38:![0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[N]], !dbg [[DBG41:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]], !dbg [[DBG41]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ], !dbg [[DBG41]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP2]], !dbg [[DBG41]]
+; CHECK-NEXT:    store i8 [[VALUE]], ptr addrspace(1) [[TMP3]], align 1, !dbg [[DBG41]]
+; CHECK-NEXT:    [[TMP4]] = add i64 [[TMP2]], 1, !dbg [[DBG41]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[N]], !dbg [[DBG41]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]], !dbg [[DBG41]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:      #dbg_value(i32 0, [[META40:![0-9]+]], !DIExpression(), [[META42:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META42]]
+;
+  tail call void @llvm.memset.p1.i64(ptr addrspace(1) align 1 %dst, i8 %value, i64 %n, i1 false)
+  ret void
+}
+
+define protected amdgpu_kernel void @memset_1025_bytes(ptr addrspace(1) %dst, i8 %value) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memset_1025_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i8 [[VALUE:%.*]]) !dbg [[DBG43:![0-9]+]] {
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]], !dbg [[DBG46:![0-9]+]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ], !dbg [[DBG46]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP1]], !dbg [[DBG46]]
+; CHECK-NEXT:    store i8 [[VALUE]], ptr addrspace(1) [[TMP2]], align 1, !dbg [[DBG46]]
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1, !dbg [[DBG46]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025, !dbg [[DBG46]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]], !dbg [[DBG46]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:      #dbg_value(i32 0, [[META45:![0-9]+]], !DIExpression(), [[META47:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META47]]
+;
+  tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(1025) %dst, i8 %value, i64 1025, i1 false)
+  ret void
+}
+
+define protected amdgpu_kernel void @memset_2048_bytes(ptr addrspace(1) %dst, i8 %value) {
+; CHECK-LABEL: define protected amdgpu_kernel void @memset_2048_bytes(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i8 [[VALUE:%.*]]) !dbg [[DBG48:![0-9]+]] {
+; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]], !dbg [[DBG51:![0-9]+]]
+; CHECK:       [[LOADSTORELOOP]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ], !dbg [[DBG51]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP1]], !dbg [[DBG51]]
+; CHECK-NEXT:    store i8 [[VALUE]], ptr addrspace(1) [[TMP2]], align 1, !dbg [[DBG51]]
+; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1, !dbg [[DBG51]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 2048, !dbg [[DBG51]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]], !dbg [[DBG51]]
+; CHECK:       [[SPLIT]]:
+; CHECK-NEXT:      #dbg_value(i32 0, [[META50:![0-9]+]], !DIExpression(), [[META52:![0-9]+]])
+; CHECK-NEXT:    ret void, !dbg [[META52]]
+;
+  tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(2048) %dst, i8 %value, i64 2048, i1 false)
+  ret void
+}
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+; CHECK: [[META1]] = !DIFile(filename: "{{.*}}<stdin>", directory: {{.*}})
+; CHECK: [[DBG5]] = distinct !DISubprogram(name: "memcpy", linkageName: "memcpy", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META8:![0-9]+]])
+; CHECK: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]])
+; CHECK: [[META7]] = !{}
+; CHECK: [[META8]] = !{[[META9]]}
+; CHECK: [[META9]] = !DILocalVariable(name: "1", scope: [[DBG5]], file: [[META1]], line: 2, type: [[META10:![0-9]+]])
+; CHECK: [[META10]] = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+; CHECK: [[DBG11]] = !DILocation(line: 1, column: 1, scope: [[DBG5]])
+; CHECK: [[META12]] = !DILocation(line: 2, column: 1, scope: [[DBG5]])
+; CHECK: [[DBG13]] = distinct !DISubprogram(name: "memcpy_1025_bytes", linkageName: "memcpy_1025_bytes", scope: null, file: [[META1]], line: 3, type: [[META6]], scopeLine: 3, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META14:![0-9]+]])
+; CHECK: [[META14]] = !{[[META15]]}
+; CHECK: [[META15]] = !DILocalVariable(name: "2", scope: [[DBG13]], file: [[META1]], line: 4, type: [[META10]])
+; CHECK: [[DBG16]] = !DILocation(line: 3, column: 1, scope: [[DBG13]])
+; CHECK: [[META17]] = !DILocation(line: 4, column: 1, scope: [[DBG13]])
+; CHECK: [[DBG18]] = distinct !DISubprogram(name: "memcpy_2048_bytes", linkageName: "memcpy_2048_bytes", scope: null, file: [[META1]], line: 5, type: [[META6]], scopeLine: 5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META19:![0-9]+]])
+; CHECK: [[META19]] = !{[[META20]]}
+; CHECK: [[META20]] = !DILocalVariable(name: "3", scope: [[DBG18]], file: [[META1]], line: 6, type: [[META10]])
+; CHECK: [[DBG21]] = !DILocation(line: 5, column: 1, scope: [[DBG18]])
+; CHECK: [[META22]] = !DILocation(line: 6, column: 1, scope: [[DBG18]])
+; CHECK: [[DBG23]] = distinct !DISubprogram(name: "memmove", linkageName: "memmove", scope: null, file: [[META1]], line: 7, type: [[META6]], scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META24:![0-9]+]])
+; CHECK: [[META24]] = !{[[META25]]}
+; CHECK: [[META25]] = !DILocalVariable(name: "4", scope: [[DBG23]], file: [[META1]], line: 8, type: [[META10]])
+; CHECK: [[DBG26]] = !DILocation(line: 7, column: 1, scope: [[DBG23]])
+; CHECK: [[META27]] = !DILocation(line: 8, column: 1, scope: [[DBG23]])
+; CHECK: [[DBG28]] = distinct !DISubprogram(name: "memmove_1025_bytes", linkageName: "memmove_1025_bytes", scope: null, file: [[META1]], line: 9, type: [[META6]], scopeLine: 9, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META29:![0-9]+]])
+; CHECK: [[META29]] = !{[[META30]]}
+; CHECK: [[META30]] = !DILocalVariable(name: "5", scope: [[DBG28]], file: [[META1]], line: 10, type: [[META10]])
+; CHECK: [[DBG31]] = !DILocation(line: 9, column: 1, scope: [[DBG28]])
+; CHECK: [[META32]] = !DILocation(line: 10, column: 1, scope: [[DBG28]])
+; CHECK: [[DBG33]] = distinct !DISubprogram(name: "memmove_2048_bytes", linkageName: "memmove_2048_bytes", scope: null, file: [[META1]], line: 11, type: [[META6]], scopeLine: 11, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META34:![0-9]+]])
+; CHECK: [[META34]] = !{[[META35]]}
+; CHECK: [[META35]] = !DILocalVariable(name: "6", scope: [[DBG33]], file: [[META1]], line: 12, type: [[META10]])
+; CHECK: [[DBG36]] = !DILocation(line: 11, column: 1, scope: [[DBG33]])
+; CHECK: [[META37]] = !DILocation(line: 12, column: 1, scope: [[DBG33]])
+; CHECK: [[DBG38]] = distinct !DISubprogram(name: "memset", linkageName: "memset", scope: null, file: [[META1]], line: 13, type: [[META6]], scopeLine: 13, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META39:![0-9]+]])
+; CHECK: [[META39]] = !{[[META40]]}
+; CHECK: [[META40]] = !DILocalVariable(name: "7", scope: [[DBG38]], file: [[META1]], line: 14, type: [[META10]])
+; CHECK: [[DBG41]] = !DILocation(line: 13, column: 1, scope: [[DBG38]])
+; CHECK: [[META42]] = !DILocation(line: 14, column: 1, scope: [[DBG38]])
+; CHECK: [[DBG43]] = distinct !DISubprogram(name: "memset_1025_bytes", linkageName: "memset_1025_bytes", scope: null, file: [[META1]], line: 15, type: [[META6]], scopeLine: 15, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META44:![0-9]+]])
+; CHECK: [[META44]] = !{[[META45]]}
+; CHECK: [[META45]] = !DILocalVariable(name: "8", scope: [[DBG43]], file: [[META1]], line: 16, type: [[META10]])
+; CHECK: [[DBG46]] = !DILocation(line: 15, column: 1, scope: [[DBG43]])
+; CHECK: [[META47]] = !DILocation(line: 16, column: 1, scope: [[DBG43]])
+; CHECK: [[DBG48]] = distinct !DISubprogram(name: "memset_2048_bytes", linkageName: "memset_2048_bytes", scope: null, file: [[META1]], line: 17, type: [[META6]], scopeLine: 17, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META49:![0-9]+]])
+; CHECK: [[META49]] = !{[[META50]]}
+; CHECK: [[META50]] = !DILocalVariable(name: "9", scope: [[DBG48]], file: [[META1]], line: 18, type: [[META10]])
+; CHECK: [[DBG51]] = !DILocation(line: 17, column: 1, scope: [[DBG48]])
+; CHECK: [[META52]] = !DILocation(line: 18, column: 1, scope: [[DBG48]])
+;.


        


More information about the llvm-commits mailing list