[llvm] [AMDGPU] Remove special cases in TTI::getMemcpyLoop(Residual)LoweringType (PR #125507)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 3 06:29:51 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Fabian Ritter (ritter-x2a)
<details>
<summary>Changes</summary>
These special cases limit the width of the memory operations we use when lowering memcpy/memmove in cases where the pointer arguments are only 2-byte aligned or point into the LDS/GDS address spaces.
I found that performance in microbenchmarks on gfx90a, gfx1030, and gfx1100 is better without this limitation, so this patch removes the special cases and relies on legalization to decompose accesses where necessary.
---
Patch is 70.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125507.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+18-41)
- (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll (+145-113)
- (modified) llvm/test/CodeGen/AMDGPU/memmove-var-size.ll (+95-89)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5bfd8914b9a46b..09f7877b13b3ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
return 1024;
}
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
if (AtomicElementSize)
return Type::getIntNTy(Context, *AtomicElementSize * 8);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
- // hardware into byte accesses. If you assume all alignments are equally
- // probable, it's more efficient on average to use short accesses for this
- // case.
- if (MinAlign == Align(2))
- return Type::getInt16Ty(Context);
-
- // Not all subtargets have 128-bit DS instructions, and we currently don't
- // form them by default.
- if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
- DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
- return FixedVectorType::get(Type::getInt32Ty(Context), 2);
- }
-
- // Global memory works best with 16-byte accesses.
+ // 16-byte accesses achieve the highest copy throughput.
// If the operation has a fixed known length that is large enough, it is
// worthwhile to return an even wider type and let legalization lower it into
- // multiple accesses, effectively unrolling the memcpy loop. Private memory
- // also hits this, although accesses may be decomposed.
+ // multiple accesses, effectively unrolling the memcpy loop.
+ // We also rely on legalization to decompose into smaller accesses for
+ // subtargets and address spaces where it is necessary.
//
// Don't unroll if Length is not a constant, since unrolling leads to worse
// performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
DestAlign, AtomicCpySize);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- if (MinAlign != Align(2)) {
- Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
- while (RemainingBytes >= 16) {
- OpsOut.push_back(I32x4Ty);
- RemainingBytes -= 16;
- }
+ Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+ while (RemainingBytes >= 16) {
+ OpsOut.push_back(I32x4Ty);
+ RemainingBytes -= 16;
+ }
- Type *I64Ty = Type::getInt64Ty(Context);
- while (RemainingBytes >= 8) {
- OpsOut.push_back(I64Ty);
- RemainingBytes -= 8;
- }
+ Type *I64Ty = Type::getInt64Ty(Context);
+ while (RemainingBytes >= 8) {
+ OpsOut.push_back(I64Ty);
+ RemainingBytes -= 8;
+ }
- Type *I32Ty = Type::getInt32Ty(Context);
- while (RemainingBytes >= 4) {
- OpsOut.push_back(I32Ty);
- RemainingBytes -= 4;
- }
+ Type *I32Ty = Type::getInt32Ty(Context);
+ while (RemainingBytes >= 4) {
+ OpsOut.push_back(I32Ty);
+ RemainingBytes -= 4;
}
Type *I16Ty = Type::getInt16Ty(Context);
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index ffe9e06c04ae45..5a9f53ec0077db 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -330,17 +330,17 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -681,13 +681,25 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) [[TMP15]], align 2
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i64 [[TMP16]], ptr addrspace(1) [[TMP17]], align 2
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
+; OPT-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
+; OPT-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
+; OPT-NEXT: [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 2
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
+; OPT-NEXT: store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
@@ -731,13 +743,17 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -754,13 +770,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -804,13 +824,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -854,13 +878,17 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 4
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -904,13 +932,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -958,17 +990,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1028,17 +1060,17 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1063,17 +1095,17 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/125507
More information about the llvm-commits
mailing list