[llvm] 0696240 - LowerMemIntrinsics: Check address space aliasing for memmove expansion

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 14 04:57:05 PDT 2023


Author: Matt Arsenault
Date: 2023-06-14T07:56:58-04:00
New Revision: 0696240384cf4b18f65e31d68b51df96c328cb24

URL: https://github.com/llvm/llvm-project/commit/0696240384cf4b18f65e31d68b51df96c328cb24
DIFF: https://github.com/llvm/llvm-project/commit/0696240384cf4b18f65e31d68b51df96c328cb24.diff

LOG: LowerMemIntrinsics: Check address space aliasing for memmove expansion

For cases where we cannot insert an addrspacecast, we can still expand
like a memcpy if we know the address spaces cannot alias. Normally,
non-aliasing memmoves are optimized to memcpy, but we cannot rely on
that during lowering. If a target has aliasing address spaces that
cannot be cast between, we still have to give up on lowering the
memmove.
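
As a minimal illustration (distilled from the new AMDGPU tests below; the
function name here is only an example), a memmove between LDS (addrspace(3))
and private (addrspace(5)) has no valid addrspacecast in either direction,
but the two address spaces cannot alias, so it can now be expanded as a
plain memcpy-style copy loop instead of being left unexpanded:

  declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1)

  define amdgpu_kernel void @memmove_lds_from_private(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
    call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
    ret void
  }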

Added: 
    

Modified: 
    llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
    llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index d06f7db2aa10d..906eb71fc2d98 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -376,7 +376,7 @@ void llvm::createMemCpyLoopUnknownSize(
 //   }
 //   return dst;
 // }
-static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
+static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
                               Value *DstAddr, Value *CopyLen, Align SrcAlign,
                               Align DstAlign, bool SrcIsVolatile,
                               bool DstIsVolatile,
@@ -385,30 +385,8 @@ static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   BasicBlock *OrigBB = InsertBefore->getParent();
   Function *F = OrigBB->getParent();
   const DataLayout &DL = F->getParent()->getDataLayout();
-
   // TODO: Use different element type if possible?
-  IRBuilder<> CastBuilder(InsertBefore);
-  Type *EltTy = CastBuilder.getInt8Ty();
-
-  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
-  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
-  if (SrcAS != DstAS) {
-    if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
-      DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
-    else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
-      SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
-    else {
-      // We don't know generically if it's legal to introduce an
-      // addrspacecast. We need to know either if it's legal to insert an
-      // addrspacecast, or if the address spaces cannot alias.
-      //
-      // TODO: Check if address spaces cannot alias and lower as memcpy.
-      LLVM_DEBUG(
-          dbgs() << "Do not know how to expand memmove between different "
-                    "address spaces\n");
-      return false;
-    }
-  }
+  Type *EltTy = Type::getInt8Ty(F->getContext());
 
   // Create a comparison of src and dst, based on which we jump to either
   // the forward-copy part of the function (if src >= dst) or the backwards-copy
@@ -448,6 +426,7 @@ static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
   BasicBlock *LoopBB =
     BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
   IRBuilder<> LoopBuilder(LoopBB);
+
   PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
   Value *IndexPtr = LoopBuilder.CreateSub(
       LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
@@ -484,7 +463,6 @@ static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
 
   BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
   ElseTerm->eraseFromParent();
-  return true;
 }
 
 static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
@@ -575,15 +553,55 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
 
 bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
                                const TargetTransformInfo &TTI) {
-  return createMemMoveLoop(
-      /* InsertBefore */ Memmove,
-      /* SrcAddr */ Memmove->getRawSource(),
-      /* DstAddr */ Memmove->getRawDest(),
-      /* CopyLen */ Memmove->getLength(),
-      /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
-      /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
-      /* SrcIsVolatile */ Memmove->isVolatile(),
-      /* DstIsVolatile */ Memmove->isVolatile(), TTI);
+  Value *CopyLen = Memmove->getLength();
+  Value *SrcAddr = Memmove->getRawSource();
+  Value *DstAddr = Memmove->getRawDest();
+  Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
+  Align DstAlign = Memmove->getDestAlign().valueOrOne();
+  bool SrcIsVolatile = Memmove->isVolatile();
+  bool DstIsVolatile = SrcIsVolatile;
+  IRBuilder<> CastBuilder(Memmove);
+
+  unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
+  unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
+  if (SrcAS != DstAS) {
+    if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
+      // We may not be able to emit a pointer comparison, but we don't have
+      // to. Expand as memcpy.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
+        createMemCpyLoopKnownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
+                                  CI, SrcAlign, DstAlign, SrcIsVolatile,
+                                  DstIsVolatile,
+                                  /*CanOverlap=*/false, TTI);
+      } else {
+        createMemCpyLoopUnknownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
+                                    CopyLen, SrcAlign, DstAlign, SrcIsVolatile,
+                                    DstIsVolatile,
+                                    /*CanOverlap=*/false, TTI);
+      }
+
+      return true;
+    }
+
+    if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
+      DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
+    else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
+      SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
+    else {
+      // We don't know generically if it's legal to introduce an
+      // addrspacecast. We need to know either if it's legal to insert an
+      // addrspacecast, or if the address spaces cannot alias.
+      LLVM_DEBUG(
+          dbgs() << "Do not know how to expand memmove between different "
+                    "address spaces\n");
+      return false;
+    }
+  }
+
+  createMemMoveLoop(
+      /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
+      SrcIsVolatile, DstIsVolatile, TTI);
+  return true;
 }
 
 void llvm::expandMemSetAsLoop(MemSetInst *Memset) {

diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 0573d1e0b2fd3..ae49fd2dcf68b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -11,6 +11,8 @@ declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3)
 declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
 declare void @llvm.memmove.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
 declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
+declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
+declare void @llvm.memmove.p5.p3.i32(ptr addrspace(5) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
 declare void @llvm.memmove.p0.p1.i64(ptr nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
 declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
 declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
@@ -1473,18 +1475,46 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %
 }
 
 define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
-; OPT-LABEL: @memmove_private_align1_global_align1(
-; OPT-NEXT:    call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT:    ret void
+; MAX1024-LABEL: @memmove_private_align1_global_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_private_align1_global_align1(
+; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
+; ALL:       load-store-loop:
+; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope !0
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias !0
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
+; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL:       memcpy-split:
+; ALL-NEXT:    ret void
 ;
   call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) %dst, ptr addrspace(1) %src, i64 256, i1 false)
   ret void
 }
 
 define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) %dst, ptr addrspace(5) %src) {
-; OPT-LABEL: @memmove_global_align1_private_align1(
-; OPT-NEXT:    call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT:    ret void
+; MAX1024-LABEL: @memmove_global_align1_private_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_global_align1_private_align1(
+; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
+; ALL:       load-store-loop:
+; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !3
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias !3
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
+; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL:       memcpy-split:
+; ALL-NEXT:    ret void
 ;
   call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) %dst, ptr addrspace(5) %src, i64 256, i1 false)
   ret void
@@ -1517,5 +1547,185 @@ define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %d
   ret void
 }
 
+define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) %dst, ptr addrspace(5) %src) {
+; MAX1024-LABEL: @memmove_local_align1_private_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_local_align1_private_align1(
+; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
+; ALL:       load-store-loop:
+; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !6
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias !6
+; ALL-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
+; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL:       memcpy-split:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
+; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
+; MAX1024-NEXT:    [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
+; MAX1024-NEXT:    [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; MAX1024-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; MAX1024:       loop-memcpy-expansion:
+; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope !0
+; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias !0
+; MAX1024-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
+; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; MAX1024-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; MAX1024:       loop-memcpy-residual:
+; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; MAX1024-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; MAX1024-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
+; MAX1024-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope !0
+; MAX1024-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
+; MAX1024-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias !0
+; MAX1024-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; MAX1024-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
+; MAX1024-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; MAX1024:       post-loop-memcpy-expansion:
+; MAX1024-NEXT:    ret void
+; MAX1024:       loop-memcpy-residual-header:
+; MAX1024-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
+; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
+; ALL-NEXT:    [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
+; ALL-NEXT:    [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; ALL-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; ALL:       loop-memcpy-expansion:
+; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope !9
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias !9
+; ALL-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; ALL-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; ALL:       loop-memcpy-residual:
+; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; ALL-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; ALL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
+; ALL-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope !9
+; ALL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
+; ALL-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias !9
+; ALL-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; ALL-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
+; ALL-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; ALL:       post-loop-memcpy-expansion:
+; ALL-NEXT:    ret void
+; ALL:       loop-memcpy-residual-header:
+; ALL-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
+; ALL-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+  call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_private_align1_local_align1(
+; MAX1024-NEXT:    call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT:    ret void
+;
+; ALL-LABEL: @memmove_private_align1_local_align1(
+; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
+; ALL:       load-store-loop:
+; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope !12
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias !12
+; ALL-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
+; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL:       memcpy-split:
+; ALL-NEXT:    ret void
+;
+  call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
+; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
+; MAX1024-NEXT:    [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
+; MAX1024-NEXT:    [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; MAX1024-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; MAX1024:       loop-memcpy-expansion:
+; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope !3
+; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias !3
+; MAX1024-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
+; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; MAX1024-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; MAX1024:       loop-memcpy-residual:
+; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; MAX1024-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; MAX1024-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
+; MAX1024-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope !3
+; MAX1024-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]]
+; MAX1024-NEXT:    store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias !3
+; MAX1024-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; MAX1024-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
+; MAX1024-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; MAX1024:       post-loop-memcpy-expansion:
+; MAX1024-NEXT:    ret void
+; MAX1024:       loop-memcpy-residual-header:
+; MAX1024-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
+; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
+; ALL-NEXT:    [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
+; ALL-NEXT:    [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; ALL-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; ALL-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; ALL:       loop-memcpy-expansion:
+; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope !15
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias !15
+; ALL-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
+; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; ALL-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; ALL:       loop-memcpy-residual:
+; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; ALL-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; ALL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
+; ALL-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope !15
+; ALL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]]
+; ALL-NEXT:    store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias !15
+; ALL-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; ALL-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
+; ALL-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; ALL:       post-loop-memcpy-expansion:
+; ALL-NEXT:    ret void
+; ALL:       loop-memcpy-residual-header:
+; ALL-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
+; ALL-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+  call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { argmemonly nounwind }


        

