[llvm] 6da31fa - InstCombine: Fix infinite loop in copy-constant-to-alloca transform

Wed Oct 14 09:55:35 PDT 2020

Author: Matt Arsenault
Date: 2020-10-14T12:55:25-04:00
New Revision: 6da31fa4a61d68af21dfa1e144e726ed6d77903e

URL: https://github.com/llvm/llvm-project/commit/6da31fa4a61d68af21dfa1e144e726ed6d77903e
DIFF: https://github.com/llvm/llvm-project/commit/6da31fa4a61d68af21dfa1e144e726ed6d77903e.diff

LOG: InstCombine: Fix infinite loop in copy-constant-to-alloca transform

This was broken by 16295d521e294b27106e51fac29957c1aac8ff89, when
instructions started being handled and not just constant
expressions. This was re-inserting an equivalent bitcast to the
original memcpy operand, which made a non-functional IR change on
every iteration.

This also fixes a secondary problem where it was inserting
addrspacecasts which may not have been legal (i.e. it changed the
source address space). Start visiting all pointer users and fail out
if we can't process them. Also start handling the relevant memory
intrinsic users. These cases can be dealt with by running
InferAddressSpaces separately.

Added: 
    

Modified: 
    llvm/include/llvm/IR/IRBuilder.h
    llvm/lib/IR/IRBuilder.cpp
    llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
    llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 58bde3e92746..8d11643da03a 100644

--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -573,12 +573,22 @@ class IRBuilderBase {
                         NoAliasTag);
   }
 
+  CallInst *CreateMemTransferInst(
+      Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src,
+      MaybeAlign SrcAlign, Value *Size, bool isVolatile = false,
+      MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr,
+      MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);
+
   CallInst *CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src,
                          MaybeAlign SrcAlign, Value *Size,
                          bool isVolatile = false, MDNode *TBAATag = nullptr,
                          MDNode *TBAAStructTag = nullptr,
                          MDNode *ScopeTag = nullptr,
-                         MDNode *NoAliasTag = nullptr);
+                         MDNode *NoAliasTag = nullptr) {
+    return CreateMemTransferInst(Intrinsic::memcpy, Dst, DstAlign, Src,
+                                 SrcAlign, Size, isVolatile, TBAATag,
+                                 TBAAStructTag, ScopeTag, NoAliasTag);
+  }
 
   CallInst *CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src,
                                MaybeAlign SrcAlign, Value *Size);

diff  --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 3319f48b42d7..982a158ff5c0 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -136,22 +136,21 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet(
   return CI;
 }
 
-CallInst *IRBuilderBase::CreateMemCpy(Value *Dst, MaybeAlign DstAlign,
-                                      Value *Src, MaybeAlign SrcAlign,
-                                      Value *Size, bool isVolatile,
-                                      MDNode *TBAATag, MDNode *TBAAStructTag,
-                                      MDNode *ScopeTag, MDNode *NoAliasTag) {
+CallInst *IRBuilderBase::CreateMemTransferInst(
+    Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src,
+    MaybeAlign SrcAlign, Value *Size, bool isVolatile, MDNode *TBAATag,
+    MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) {
   Dst = getCastedInt8PtrValue(Dst);
   Src = getCastedInt8PtrValue(Src);
 
   Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};
   Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
   Module *M = BB->getParent()->getParent();
-  Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);
+  Function *TheFn = Intrinsic::getDeclaration(M, IntrID, Tys);
 
   CallInst *CI = createCallHelper(TheFn, Ops, this);
 
-  auto* MCI = cast<MemCpyInst>(CI);
+  auto* MCI = cast<MemTransferInst>(CI);
   if (DstAlign)
     MCI->setDestAlignment(*DstAlign);
   if (SrcAlign)

diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 8050c575c1f8..b1ee44d1db19 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -237,37 +237,40 @@ namespace {
 class PointerReplacer {
 public:
   PointerReplacer(InstCombinerImpl &IC) : IC(IC) {}
+
+  bool collectUsers(Instruction &I);
   void replacePointer(Instruction &I, Value *V);
 
 private:
-  void findLoadAndReplace(Instruction &I);
   void replace(Instruction *I);
   Value *getReplacement(Value *I);
 
-  SmallVector<Instruction *, 4> Path;
+  SmallSetVector<Instruction *, 4> Worklist;
   MapVector<Value *, Value *> WorkMap;
   InstCombinerImpl &IC;
 };
 } // end anonymous namespace
 
-void PointerReplacer::findLoadAndReplace(Instruction &I) {
+bool PointerReplacer::collectUsers(Instruction &I) {
   for (auto U : I.users()) {
-    auto *Inst = dyn_cast<Instruction>(&*U);
-    if (!Inst)
-      return;
-    LLVM_DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
-    if (isa<LoadInst>(Inst)) {
-      for (auto P : Path)
-        replace(P);
-      replace(Inst);
+    Instruction *Inst = cast<Instruction>(&*U);
+    if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
+      if (Load->isVolatile())
+        return false;
+      Worklist.insert(Load);
     } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
-      Path.push_back(Inst);
-      findLoadAndReplace(*Inst);
-      Path.pop_back();
+      Worklist.insert(Inst);
+      if (!collectUsers(*Inst))
+        return false;
+    } else if (isa<MemTransferInst>(Inst)) {
+      Worklist.insert(Inst);
     } else {
-      return;
+      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
+      return false;
     }
   }
+
+  return true;
 }
 
 Value *PointerReplacer::getReplacement(Value *V) {
@@ -309,6 +312,28 @@ void PointerReplacer::replace(Instruction *I) {
     IC.InsertNewInstWith(NewI, *BC);
     NewI->takeName(BC);
     WorkMap[BC] = NewI;
+  } else if (auto *MemCpy = dyn_cast<MemTransferInst>(I)) {
+    auto *SrcV = getReplacement(MemCpy->getRawSource());
+    // The pointer may appear in the destination of a copy, but we don't want to
+    // replace it.
+    if (!SrcV) {
+      assert(getReplacement(MemCpy->getRawDest()) &&
+             "destination not in replace list");
+      return;
+    }
+
+    IC.Builder.SetInsertPoint(MemCpy);
+    auto *NewI = IC.Builder.CreateMemTransferInst(
+        MemCpy->getIntrinsicID(), MemCpy->getRawDest(), MemCpy->getDestAlign(),
+        SrcV, MemCpy->getSourceAlign(), MemCpy->getLength(),
+        MemCpy->isVolatile());
+    AAMDNodes AAMD;
+    MemCpy->getAAMetadata(AAMD);
+    if (AAMD)
+      NewI->setAAMetadata(AAMD);
+
+    IC.eraseInstFromFunction(*MemCpy);
+    WorkMap[MemCpy] = NewI;
   } else {
     llvm_unreachable("should never reach here");
   }
@@ -322,7 +347,9 @@ void PointerReplacer::replacePointer(Instruction &I, Value *V) {
          "Invalid usage");
 #endif
   WorkMap[&I] = V;
-  findLoadAndReplace(I);
+
+  for (Instruction *Workitem : Worklist)
+    replace(Workitem);
 }
 
 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
@@ -376,23 +403,21 @@ Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
   // read.
   SmallVector<Instruction *, 4> ToDelete;
   if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
+    Value *TheSrc = Copy->getSource();
     Align AllocaAlign = AI.getAlign();
     Align SourceAlign = getOrEnforceKnownAlignment(
-        Copy->getSource(), AllocaAlign, DL, &AI, &AC, &DT);
+      TheSrc, AllocaAlign, DL, &AI, &AC, &DT);
     if (AllocaAlign <= SourceAlign &&
-        isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
+        isDereferenceableForAllocaSize(TheSrc, &AI, DL)) {
       LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
       LLVM_DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
-      for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
-        eraseInstFromFunction(*ToDelete[i]);
-      Value *TheSrc = Copy->getSource();
-      auto *SrcTy = TheSrc->getType();
-      auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
-                                      SrcTy->getPointerAddressSpace());
-      Value *Cast =
-        Builder.CreatePointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
-      if (AI.getType()->getPointerAddressSpace() ==
-          SrcTy->getPointerAddressSpace()) {
+      unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace();
+      auto *DestTy = PointerType::get(AI.getAllocatedType(), SrcAddrSpace);
+      if (AI.getType()->getAddressSpace() == SrcAddrSpace) {
+        for (Instruction *Delete : ToDelete)
+          eraseInstFromFunction(*Delete);
+
+        Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
         Instruction *NewI = replaceInstUsesWith(AI, Cast);
         eraseInstFromFunction(*Copy);
         ++NumGlobalCopies;
@@ -400,8 +425,14 @@ Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
       }
 
       PointerReplacer PtrReplacer(*this);
-      PtrReplacer.replacePointer(AI, Cast);
-      ++NumGlobalCopies;
+      if (PtrReplacer.collectUsers(AI)) {
+        for (Instruction *Delete : ToDelete)
+          eraseInstFromFunction(*Delete);
+
+        Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
+        PtrReplacer.replacePointer(AI, Cast);
+        ++NumGlobalCopies;
+      }
     }
   }
 

diff  --git a/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll b/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll
index c49c5a697aed..2625f6cf22a5 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll
@@ -23,6 +23,23 @@ define i8 @memcpy_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias rea
   ret i8 %load
 }
 
+; Simple memmove to alloca from constant address space argument.
+define i8 @memmove_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
+; CHECK-LABEL: @memmove_constant_arg_ptr_to_alloca(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)*
+  call void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %arg.cast, i32 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx
+  %load = load i8, i8 addrspace(5)* %gep
+  ret i8 %load
+}
+
 ; Simple memcpy to alloca from byref constant address space argument.
 define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias readonly align 4 byref([32 x i8]) %arg, i8 addrspace(1)* %out, i32 %idx) {
 ; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca(
@@ -87,9 +104,13 @@ define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(i8 addrspace(
 ; Alloca is written through a flat pointer
 define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat([31 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
 ; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [31 x i8], [31 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
+; CHECK-NEXT:    [[ALLOCA_CAST:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    [[ALLOCA_CAST_ASC:%.*]] = addrspacecast i8 addrspace(5)* [[ALLOCA_CAST]] to i8*
+; CHECK-NEXT:    [[ARG_CAST:%.*]] = getelementptr inbounds [31 x i8], [31 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 1 dereferenceable(31) [[ALLOCA_CAST_ASC]], i8 addrspace(4)* align 4 dereferenceable(31) [[ARG_CAST]], i64 31, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(5)* [[GEP]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
   %alloca = alloca [32 x i8], align 4, addrspace(5)
@@ -125,9 +146,88 @@ define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2([32 x i8] ad
   ret i8 %load
 }
 
-declare void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
+%struct.ty = type { [4 x i32] }
+
+define amdgpu_kernel void @byref_infloop(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
+; CHECK-LABEL: @byref_infloop(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)*
+  %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false)
+  call void @llvm.memcpy.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @byref_infloop_metadata(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
+; CHECK-LABEL: @byref_infloop_metadata(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false), !noalias !0
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)*
+  %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false), !noalias !0
+  call void @llvm.memcpy.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false), !noalias !1
+  ret void
+}
+
+define amdgpu_kernel void @byref_infloop_addrspacecast(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
+; CHECK-LABEL: @byref_infloop_addrspacecast(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT:    [[CAST_ARG:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    [[CAST_ALLOCA:%.*]] = bitcast [4 x i32] addrspace(5)* [[ALLOCA]] to i8 addrspace(5)*
+; CHECK-NEXT:    [[ADDRSPACECAST_ALLOCA:%.*]] = addrspacecast i8 addrspace(5)* [[CAST_ALLOCA]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ARG]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8* nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)*
+  %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
+  %addrspacecast.alloca = addrspacecast i8 addrspace(5)* %cast.alloca to i8*
+  call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i8 addrspace(4)* align 4 dereferenceable(16) %cast.arg, i64 16, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(16) %scratch, i8* nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i64 16, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @byref_infloop_memmove(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
+; CHECK-LABEL: @byref_infloop_memmove(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)*
+  %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false)
+  call void @llvm.memmove.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p5i8.i32(i8* noalias nocapture writeonly, i8 addrspace(5)* noalias nocapture readonly, i32, i1 immarg) #0
+declare void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i32, i1) #0
 declare void @llvm.memcpy.p0i8.p4i8.i64(i8* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0
+declare void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
+declare void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i32, i1) #0
+declare void @llvm.memmove.p0i8.p5i8.i32(i8* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0
 declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1
 
 attributes #0 = { argmemonly nounwind willreturn }
 attributes #1 = { nounwind readnone speculatable }
+
+!0 = !{!0}
+!1 = !{!1}