[llvm] 0696240 - LowerMemIntrinsics: Check address space aliasing for memmove expansion
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 14 04:57:05 PDT 2023
Author: Matt Arsenault
Date: 2023-06-14T07:56:58-04:00
New Revision: 0696240384cf4b18f65e31d68b51df96c328cb24
URL: https://github.com/llvm/llvm-project/commit/0696240384cf4b18f65e31d68b51df96c328cb24
DIFF: https://github.com/llvm/llvm-project/commit/0696240384cf4b18f65e31d68b51df96c328cb24.diff
LOG: LowerMemIntrinsics: Check address space aliasing for memmove expansion
For cases where we cannot insert an addrspacecast, we can still expand
like a memcpy if we know the address spaces cannot alias. Normally
non-aliasing memmoves are optimized to memcpy, but we cannot rely on
that for lowering. If a target has aliasing address spaces that cannot
be casted between, we still have to give up lowering this.
Added:
Modified:
llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index d06f7db2aa10d..906eb71fc2d98 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -376,7 +376,7 @@ void llvm::createMemCpyLoopUnknownSize(
// }
// return dst;
// }
-static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
+static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
Value *DstAddr, Value *CopyLen, Align SrcAlign,
Align DstAlign, bool SrcIsVolatile,
bool DstIsVolatile,
@@ -385,30 +385,8 @@ static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
const DataLayout &DL = F->getParent()->getDataLayout();
-
// TODO: Use different element type if possible?
- IRBuilder<> CastBuilder(InsertBefore);
- Type *EltTy = CastBuilder.getInt8Ty();
-
- unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
- unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
- if (SrcAS != DstAS) {
- if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
- DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
- else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
- SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
- else {
- // We don't know generically if it's legal to introduce an
- // addrspacecast. We need to know either if it's legal to insert an
- // addrspacecast, or if the address spaces cannot alias.
- //
- // TODO: Check if address spaces cannot alias and lower as memcpy.
- LLVM_DEBUG(
- dbgs() << "Do not know how to expand memmove between
diff erent "
- "address spaces\n");
- return false;
- }
- }
+ Type *EltTy = Type::getInt8Ty(F->getContext());
// Create the a comparison of src and dst, based on which we jump to either
// the forward-copy part of the function (if src >= dst) or the backwards-copy
@@ -448,6 +426,7 @@ static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
BasicBlock *LoopBB =
BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
IRBuilder<> LoopBuilder(LoopBB);
+
PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
Value *IndexPtr = LoopBuilder.CreateSub(
LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
@@ -484,7 +463,6 @@ static bool createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
ElseTerm->eraseFromParent();
- return true;
}
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
@@ -575,15 +553,55 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
const TargetTransformInfo &TTI) {
- return createMemMoveLoop(
- /* InsertBefore */ Memmove,
- /* SrcAddr */ Memmove->getRawSource(),
- /* DstAddr */ Memmove->getRawDest(),
- /* CopyLen */ Memmove->getLength(),
- /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
- /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
- /* SrcIsVolatile */ Memmove->isVolatile(),
- /* DstIsVolatile */ Memmove->isVolatile(), TTI);
+ Value *CopyLen = Memmove->getLength();
+ Value *SrcAddr = Memmove->getRawSource();
+ Value *DstAddr = Memmove->getRawDest();
+ Align SrcAlign = Memmove->getSourceAlign().valueOrOne();
+ Align DstAlign = Memmove->getDestAlign().valueOrOne();
+ bool SrcIsVolatile = Memmove->isVolatile();
+ bool DstIsVolatile = SrcIsVolatile;
+ IRBuilder<> CastBuilder(Memmove);
+
+ unsigned SrcAS = SrcAddr->getType()->getPointerAddressSpace();
+ unsigned DstAS = DstAddr->getType()->getPointerAddressSpace();
+ if (SrcAS != DstAS) {
+ if (!TTI.addrspacesMayAlias(SrcAS, DstAS)) {
+ // We may not be able to emit a pointer comparison, but we don't have
+ // to. Expand as memcpy.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CopyLen)) {
+ createMemCpyLoopKnownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
+ CI, SrcAlign, DstAlign, SrcIsVolatile,
+ DstIsVolatile,
+ /*CanOverlap=*/false, TTI);
+ } else {
+ createMemCpyLoopUnknownSize(/*InsertBefore=*/Memmove, SrcAddr, DstAddr,
+ CopyLen, SrcAlign, DstAlign, SrcIsVolatile,
+ DstIsVolatile,
+ /*CanOverlap=*/false, TTI);
+ }
+
+ return true;
+ }
+
+ if (TTI.isValidAddrSpaceCast(DstAS, SrcAS))
+ DstAddr = CastBuilder.CreateAddrSpaceCast(DstAddr, SrcAddr->getType());
+ else if (TTI.isValidAddrSpaceCast(SrcAS, DstAS))
+ SrcAddr = CastBuilder.CreateAddrSpaceCast(SrcAddr, DstAddr->getType());
+ else {
+ // We don't know generically if it's legal to introduce an
+ // addrspacecast. We need to know either if it's legal to insert an
+ // addrspacecast, or if the address spaces cannot alias.
+ LLVM_DEBUG(
+ dbgs() << "Do not know how to expand memmove between
diff erent "
+ "address spaces\n");
+ return false;
+ }
+ }
+
+ createMemMoveLoop(
+ /*InsertBefore=*/Memmove, SrcAddr, DstAddr, CopyLen, SrcAlign, DstAlign,
+ SrcIsVolatile, DstIsVolatile, TTI);
+ return true;
}
void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 0573d1e0b2fd3..ae49fd2dcf68b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -11,6 +11,8 @@ declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3)
declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
+declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
+declare void @llvm.memmove.p5.p3.i32(ptr addrspace(5) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p0.p1.i64(ptr nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
@@ -1473,18 +1475,46 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %
}
define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
-; OPT-LABEL: @memmove_private_align1_global_align1(
-; OPT-NEXT: call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT: ret void
+; MAX1024-LABEL: @memmove_private_align1_global_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_private_align1_global_align1(
+; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
+; ALL: load-store-loop:
+; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope !0
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias !0
+; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
+; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL: memcpy-split:
+; ALL-NEXT: ret void
;
call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) %dst, ptr addrspace(1) %src, i64 256, i1 false)
ret void
}
define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) %dst, ptr addrspace(5) %src) {
-; OPT-LABEL: @memmove_global_align1_private_align1(
-; OPT-NEXT: call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false)
-; OPT-NEXT: ret void
+; MAX1024-LABEL: @memmove_global_align1_private_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_global_align1_private_align1(
+; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
+; ALL: load-store-loop:
+; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !3
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias !3
+; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
+; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL: memcpy-split:
+; ALL-NEXT: ret void
;
call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) %dst, ptr addrspace(5) %src, i64 256, i1 false)
ret void
@@ -1517,5 +1547,185 @@ define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %d
ret void
}
+define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) %dst, ptr addrspace(5) %src) {
+; MAX1024-LABEL: @memmove_local_align1_private_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_local_align1_private_align1(
+; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
+; ALL: load-store-loop:
+; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !6
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias !6
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
+; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL: memcpy-split:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
+; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
+; MAX1024-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
+; MAX1024-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; MAX1024: loop-memcpy-expansion:
+; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; MAX1024-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope !0
+; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias !0
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
+; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; MAX1024: loop-memcpy-residual:
+; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
+; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope !0
+; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
+; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias !0
+; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
+; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; MAX1024: post-loop-memcpy-expansion:
+; MAX1024-NEXT: ret void
+; MAX1024: loop-memcpy-residual-header:
+; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
+; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
+; ALL-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
+; ALL-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; ALL: loop-memcpy-expansion:
+; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope !9
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias !9
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; ALL: loop-memcpy-residual:
+; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
+; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope !9
+; ALL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
+; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias !9
+; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
+; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; ALL: post-loop-memcpy-expansion:
+; ALL-NEXT: ret void
+; ALL: loop-memcpy-residual-header:
+; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
+; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+ call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) %dst, ptr addrspace(3) %src) {
+; MAX1024-LABEL: @memmove_private_align1_local_align1(
+; MAX1024-NEXT: call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
+; MAX1024-NEXT: ret void
+;
+; ALL-LABEL: @memmove_private_align1_local_align1(
+; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
+; ALL: load-store-loop:
+; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
+; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope !12
+; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias !12
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
+; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
+; ALL: memcpy-split:
+; ALL-NEXT: ret void
+;
+ call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 256, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
+; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
+; MAX1024-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
+; MAX1024-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; MAX1024: loop-memcpy-expansion:
+; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; MAX1024-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope !3
+; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias !3
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
+; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; MAX1024: loop-memcpy-residual:
+; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
+; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope !3
+; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]]
+; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias !3
+; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
+; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; MAX1024: post-loop-memcpy-expansion:
+; MAX1024-NEXT: ret void
+; MAX1024: loop-memcpy-residual-header:
+; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
+; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
+; ALL-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
+; ALL-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
+; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
+; ALL: loop-memcpy-expansion:
+; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
+; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope !15
+; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias !15
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
+; ALL: loop-memcpy-residual:
+; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
+; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
+; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope !15
+; ALL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]]
+; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias !15
+; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
+; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
+; ALL: post-loop-memcpy-expansion:
+; ALL-NEXT: ret void
+; ALL: loop-memcpy-residual-header:
+; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
+; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
+;
+ call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
+ ret void
+}
+
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }
More information about the llvm-commits
mailing list