[llvm] [AMDGPU] Remove special cases in TTI::getMemcpyLoop(Residual)LoweringType (PR #125507)
Fabian Ritter via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 3 06:29:30 PST 2025
https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/125507
These special cases limit the width of the memory operations we use for lowering memcpy/memmove: i16 accesses when the pointer arguments are only 2-aligned, and <2 x i32> accesses when they point into the LDS or GDS.
I found that performance in microbenchmarks on gfx90a, gfx1030, and gfx1100 is better without these limitations.
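As a rough illustration, here is a minimal C++ sketch of the old and new main-loop width selection (hypothetical helper names, simplified from GCNTTIImpl::getMemcpyLoopLoweringType in the patch below):

// Hypothetical helpers sketching the loop-width selection; address space
// numbers follow AMDGPUAS.
enum AddrSpace : unsigned { Global = 1, Region = 2, Local = 3, Private = 5 };

// Old selection: special cases narrowed the access width.
unsigned oldLoopAccessBytes(unsigned MinAlign, AddrSpace Src, AddrSpace Dst) {
  if (MinAlign == 2)
    return 2; // i16: multi-dword accesses at address == 2 (mod 4) get split
  if (Src == Local || Src == Region || Dst == Local || Dst == Region)
    return 8; // <2 x i32>: not all subtargets have 128-bit DS instructions
  return 16; // <4 x i32>
}

// New selection: 16 bytes everywhere (wider for large constant lengths),
// relying on legalization to narrow accesses where a subtarget or address
// space requires it.
unsigned newLoopAccessBytes(unsigned, AddrSpace, AddrSpace) { return 16; }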
From d79ec360d83b352f7dee00ccb194b8ecf44cb885 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Mon, 3 Feb 2025 09:09:02 -0500
Subject: [PATCH] [AMDGPU] Remove special cases in
TTI::getMemcpyLoop(Residual)LoweringType
These special cases limit the width of the memory operations we use for
lowering memcpy/memmove: i16 accesses when the pointer arguments are only
2-aligned, and <2 x i32> accesses when they point into the LDS or GDS.
I found that performance in microbenchmarks on gfx90a, gfx1030, and
gfx1100 is better without these limitations.
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 59 ++--
.../CodeGen/AMDGPU/lower-mem-intrinsics.ll | 258 ++++++++++--------
llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 184 +++++++------
3 files changed, 258 insertions(+), 243 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5bfd8914b9a46b..09f7877b13b3ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
return 1024;
}
-// FIXME: Should we use narrower types for local/region, or account for when
-// unaligned access is legal?
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
if (AtomicElementSize)
return Type::getIntNTy(Context, *AtomicElementSize * 8);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
- // hardware into byte accesses. If you assume all alignments are equally
- // probable, it's more efficient on average to use short accesses for this
- // case.
- if (MinAlign == Align(2))
- return Type::getInt16Ty(Context);
-
- // Not all subtargets have 128-bit DS instructions, and we currently don't
- // form them by default.
- if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
- DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
- return FixedVectorType::get(Type::getInt32Ty(Context), 2);
- }
-
- // Global memory works best with 16-byte accesses.
+ // 16-byte accesses achieve the highest copy throughput.
// If the operation has a fixed known length that is large enough, it is
// worthwhile to return an even wider type and let legalization lower it into
- // multiple accesses, effectively unrolling the memcpy loop. Private memory
- // also hits this, although accesses may be decomposed.
+ // multiple accesses, effectively unrolling the memcpy loop.
+ // We also rely on legalization to decompose into smaller accesses for
+ // subtargets and address spaces where it is necessary.
//
// Don't unroll if Length is not a constant, since unrolling leads to worse
// performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
DestAlign, AtomicCpySize);
- Align MinAlign = std::min(SrcAlign, DestAlign);
-
- if (MinAlign != Align(2)) {
- Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
- while (RemainingBytes >= 16) {
- OpsOut.push_back(I32x4Ty);
- RemainingBytes -= 16;
- }
+ Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
+ while (RemainingBytes >= 16) {
+ OpsOut.push_back(I32x4Ty);
+ RemainingBytes -= 16;
+ }
- Type *I64Ty = Type::getInt64Ty(Context);
- while (RemainingBytes >= 8) {
- OpsOut.push_back(I64Ty);
- RemainingBytes -= 8;
- }
+ Type *I64Ty = Type::getInt64Ty(Context);
+ while (RemainingBytes >= 8) {
+ OpsOut.push_back(I64Ty);
+ RemainingBytes -= 8;
+ }
- Type *I32Ty = Type::getInt32Ty(Context);
- while (RemainingBytes >= 4) {
- OpsOut.push_back(I32Ty);
- RemainingBytes -= 4;
- }
+ Type *I32Ty = Type::getInt32Ty(Context);
+ while (RemainingBytes >= 4) {
+ OpsOut.push_back(I32Ty);
+ RemainingBytes -= 4;
}
Type *I16Ty = Type::getInt16Ty(Context);
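In effect, the residual lowering now applies the same greedy 16/8/4/2/1-byte decomposition for every alignment. A minimal sketch of that selection, using a hypothetical helper name simplified from getMemcpyLoopResidualLoweringType above:

#include <cstdint>
#include <initializer_list>
#include <vector>

// Greedy residual decomposition after this patch: the widths correspond to
// <4 x i32>, i64, i32, i16, and i8 accesses, now used for all alignments.
std::vector<unsigned> residualAccessBytes(uint64_t RemainingBytes) {
  std::vector<unsigned> Widths;
  for (unsigned W : {16u, 8u, 4u, 2u, 1u}) {
    while (RemainingBytes >= W) {
      Widths.push_back(W);
      RemainingBytes -= W;
    }
  }
  return Widths;
}

For example, residualAccessBytes(15) yields {8, 4, 2, 1}, which matches the i64/i32/i16/i8 tail emitted for the 1039-byte test case below.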
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index ffe9e06c04ae45..5a9f53ec0077db 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -330,17 +330,17 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -681,13 +681,25 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP16:%.*]] = load i64, ptr addrspace(1) [[TMP15]], align 2
+; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i64 [[TMP16]], ptr addrspace(1) [[TMP17]], align 2
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
+; OPT-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
+; OPT-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
+; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
+; OPT-NEXT: [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 2
+; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
+; OPT-NEXT: store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
@@ -731,13 +743,17 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -754,13 +770,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
@@ -804,13 +824,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 4
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -854,13 +878,17 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 4
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -904,13 +932,17 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
+; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
-; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
+; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
+; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
+; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
+; OPT-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(5) [[TMP9]], align 2
+; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
+; OPT-NEXT: store i16 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
@@ -958,17 +990,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1028,17 +1060,17 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1063,17 +1095,17 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 2
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1098,17 +1130,17 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1133,17 +1165,17 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1168,17 +1200,17 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
+; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
-; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
+; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
@@ -1693,10 +1725,10 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
+; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
-; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
@@ -1708,17 +1740,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
+; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
-; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
@@ -1738,17 +1770,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
+; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
-; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
@@ -1781,10 +1813,10 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
+; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
-; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
+; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
@@ -1796,17 +1828,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
+; MAX1024-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
-; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
+; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
@@ -1826,17 +1858,17 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
+; ALL-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
-; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT: store <4 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
+; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 16
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
@@ -1871,20 +1903,20 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -1896,7 +1928,7 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -1918,11 +1950,11 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -1930,10 +1962,10 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
-; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
@@ -1965,20 +1997,20 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
-; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
+; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -1990,7 +2022,7 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -2012,11 +2044,11 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -2024,10 +2056,10 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
-; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
+; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
@@ -2058,20 +2090,20 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
-; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 8
+; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 256
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
-; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
+; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP2]], align 1
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
+; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
-; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
+; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(3) [[TMP5]], align 1
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
-; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
-; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
+; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
@@ -2083,7 +2115,7 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
-; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
@@ -2104,11 +2136,11 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 8
+; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 16
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
+; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP9]], align 1
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
+; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
@@ -2116,10 +2148,10 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
+; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP12]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
-; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
-; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
+; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index a68d2e575607d4..bc8bcc622810f5 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -306,10 +306,10 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-LABEL: memmove_p0_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v7, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v8, 0
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; CHECK-NEXT: v_and_b32_e32 v5, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v6, v4
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
@@ -338,15 +338,15 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[13:14], v4
-; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT: ds_read_b128 v[13:16], v4
+; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
-; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s5, s9
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[13:14]
-; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 8
+; CHECK-NEXT: flat_store_dwordx4 v[9:10], v[13:16]
+; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB2_5
@@ -355,7 +355,7 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB2_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -414,26 +414,26 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB2_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -8
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; CHECK-NEXT: v_add3_u32 v2, v3, v2, -8
+; CHECK-NEXT: v_add3_u32 v2, v3, v2, -16
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB2_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[3:4], v2
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v5, -8
-; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v9, vcc_lo, v0, v5
-; CHECK-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v6, vcc_lo
-; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
-; CHECK-NEXT: v_mov_b32_e32 v5, v7
-; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
-; CHECK-NEXT: v_mov_b32_e32 v6, v8
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
+; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v5, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v11, vcc_lo, v0, v5
+; CHECK-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[3:4]
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
+; CHECK-NEXT: v_mov_b32_e32 v5, v3
; CHECK-NEXT: s_or_b32 s7, s4, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[3:4]
+; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB2_15
; CHECK-NEXT: .LBB2_16: ; %Flow36
@@ -1043,9 +1043,9 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-LABEL: memmove_p1_p3:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1056,16 +1056,16 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB7_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[10:11], v9
-; CHECK-NEXT: v_add_co_u32 v12, vcc_lo, v0, s4
-; CHECK-NEXT: s_add_u32 s4, s4, 8
-; CHECK-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT: ds_read_b128 v[10:13], v9
+; CHECK-NEXT: v_add_co_u32 v14, vcc_lo, v0, s4
+; CHECK-NEXT: s_add_u32 s4, s4, 16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v[12:13], v[10:11], off
+; CHECK-NEXT: global_store_dwordx4 v[14:15], v[10:13], off
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB7_2
; CHECK-NEXT: .LBB7_3: ; %Flow9
@@ -1076,7 +1076,7 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB7_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v3
@@ -1327,11 +1327,11 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-LABEL: memmove_p3_p0:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[5:6]
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
@@ -1361,16 +1361,16 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB10_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: flat_load_dwordx2 v[13:14], v[9:10]
-; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[9:10]
+; CHECK-NEXT: v_add_co_u32 v11, s5, v11, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
-; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 8
+; CHECK-NEXT: v_add_co_u32 v9, s5, v9, 16
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s6, 0, v[11:12]
; CHECK-NEXT: s_or_b32 s9, s6, s9
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v4, v[13:14]
-; CHECK-NEXT: v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT: ds_write_b128 v4, v[13:16]
+; CHECK-NEXT: v_add_nc_u32_e32 v4, 16, v4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; CHECK-NEXT: s_cbranch_execnz .LBB10_5
; CHECK-NEXT: .LBB10_6: ; %Flow34
@@ -1378,7 +1378,7 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_and_saveexec_b32 s8, s4
; CHECK-NEXT: s_cbranch_execz .LBB10_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: v_add_nc_u32_e32 v3, v0, v3
; CHECK-NEXT: v_add_co_u32 v0, s5, v1, v7
@@ -1437,23 +1437,23 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB10_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -8
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
-; CHECK-NEXT: v_add3_u32 v0, v3, v0, -8
+; CHECK-NEXT: v_add3_u32 v0, v3, v0, -16
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: .LBB10_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v3, vcc_lo, v1, v7
; CHECK-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
-; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -8
+; CHECK-NEXT: v_add_co_u32 v7, vcc_lo, v7, -16
; CHECK-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
-; CHECK-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[3:4]
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
; CHECK-NEXT: s_or_b32 s5, vcc_lo, s5
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v0, v[3:4]
-; CHECK-NEXT: v_add_nc_u32_e32 v0, -8, v0
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: v_add_nc_u32_e32 v0, -16, v0
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_cbranch_execnz .LBB10_15
; CHECK-NEXT: .LBB10_16: ; %Flow36
@@ -1470,9 +1470,9 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-LABEL: memmove_p3_p1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1485,14 +1485,14 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
-; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v9, v[10:11]
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: ds_write_b128 v9, v[10:13]
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB11_2
; CHECK-NEXT: .LBB11_3: ; %Flow9
@@ -1503,7 +1503,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB11_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
@@ -1538,8 +1538,8 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v5, 0
-; CHECK-NEXT: v_and_b32_e32 v4, 7, v2
-; CHECK-NEXT: v_and_b32_e32 v6, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v4, 15, v2
+; CHECK-NEXT: v_and_b32_e32 v6, -16, v2
; CHECK-NEXT: v_mov_b32_e32 v7, v3
; CHECK-NEXT: s_mov_b32 s6, exec_lo
; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[4:5]
@@ -1563,15 +1563,15 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: .LBB12_5: ; %memmove_fwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[9:10], v3
-; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -8
+; CHECK-NEXT: ds_read_b128 v[9:12], v3
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -16
; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT: v_add_nc_u32_e32 v3, 8, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v3, 16, v3
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v8, v[9:10]
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: ds_write_b128 v8, v[9:12]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execnz .LBB12_5
; CHECK-NEXT: .LBB12_6: ; %Flow41
@@ -1579,7 +1579,7 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB12_9
; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v2
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -1630,24 +1630,24 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s4, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB12_16
; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT: v_and_b32_e32 v5, -8, v2
+; CHECK-NEXT: v_and_b32_e32 v5, -16, v2
; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v5
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v5
; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v4
; CHECK-NEXT: v_sub_co_u32 v0, vcc_lo, 0, v5
; CHECK-NEXT: v_add_nc_u32_e32 v4, v1, v4
; CHECK-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: .LBB12_15: ; %memmove_bwd_main_loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[5:6], v4
-; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 8
+; CHECK-NEXT: ds_read_b128 v[5:8], v4
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, 16
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; CHECK-NEXT: v_add_nc_u32_e32 v4, -8, v4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -16, v4
; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: ds_write_b64 v2, v[5:6]
-; CHECK-NEXT: v_add_nc_u32_e32 v2, -8, v2
+; CHECK-NEXT: ds_write_b128 v2, v[5:8]
+; CHECK-NEXT: v_add_nc_u32_e32 v2, -16, v2
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB12_15
; CHECK-NEXT: .LBB12_16: ; %Flow43
@@ -1664,9 +1664,9 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-LABEL: memmove_p3_p4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v7, -16, v3
; CHECK-NEXT: v_mov_b32_e32 v8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v3
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
@@ -1679,14 +1679,14 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_co_u32 v10, vcc_lo, v1, s4
; CHECK-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
-; CHECK-NEXT: global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v[10:11], off
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v9, v[10:11]
-; CHECK-NEXT: v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT: ds_write_b128 v9, v[10:13]
+; CHECK-NEXT: v_add_nc_u32_e32 v9, 16, v9
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB13_2
; CHECK-NEXT: .LBB13_3: ; %Flow9
@@ -1697,7 +1697,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB13_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT: v_and_b32_e32 v3, -16, v3
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v3
@@ -1735,27 +1735,30 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB14_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v7, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB14_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_clause 0x3
; CHECK-NEXT: buffer_load_dword v9, v7, s[0:3], 0 offen
; CHECK-NEXT: buffer_load_dword v10, v7, s[0:3], 0 offen offset:4
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: buffer_load_dword v11, v7, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v12, v7, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7
+; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write_b64 v8, v[9:10]
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: ds_write_b128 v8, v[9:12]
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB14_2
; CHECK-NEXT: .LBB14_3: ; %Flow14
@@ -1766,7 +1769,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB14_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -2021,25 +2024,28 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: v_mov_b32_e32 v6, 0
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, exec_lo
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
-; CHECK-NEXT: v_and_b32_e32 v5, 7, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
+; CHECK-NEXT: v_and_b32_e32 v5, 15, v4
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[2:3]
; CHECK-NEXT: s_cbranch_execz .LBB17_3
; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader
; CHECK-NEXT: v_mov_b32_e32 v7, v1
; CHECK-NEXT: v_mov_b32_e32 v8, v0
; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB17_2: ; %loop-memcpy-expansion
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ds_read_b64 v[9:10], v7
-; CHECK-NEXT: s_add_u32 s4, s4, 8
+; CHECK-NEXT: ds_read_b128 v[9:12], v7
+; CHECK-NEXT: s_add_u32 s4, s4, 16
; CHECK-NEXT: s_addc_u32 s5, s5, 0
-; CHECK-NEXT: v_add_nc_u32_e32 v7, 8, v7
+; CHECK-NEXT: v_add_nc_u32_e32 v7, 16, v7
; CHECK-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v12, v8, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v11, v8, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen
-; CHECK-NEXT: v_add_nc_u32_e32 v8, 8, v8
+; CHECK-NEXT: v_add_nc_u32_e32 v8, 16, v8
; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: s_cbranch_execnz .LBB17_2
@@ -2051,7 +2057,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_xor_b32 s6, exec_lo, s6
; CHECK-NEXT: s_cbranch_execz .LBB17_7
; CHECK-NEXT: ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT: v_and_b32_e32 v2, -8, v4
+; CHECK-NEXT: v_and_b32_e32 v2, -16, v4
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v2