[llvm] 4c697f7 - [LowerMemIntrinsics] Use i8 GEPs in memcpy/memmove lowering (#112707)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 22 07:48:54 PDT 2024


Author: Fabian Ritter
Date: 2024-10-22T16:48:50+02:00
New Revision: 4c697f7037b719b9ed1fa3bcaa68c4a4219c2fc5

URL: https://github.com/llvm/llvm-project/commit/4c697f7037b719b9ed1fa3bcaa68c4a4219c2fc5
DIFF: https://github.com/llvm/llvm-project/commit/4c697f7037b719b9ed1fa3bcaa68c4a4219c2fc5.diff

LOG: [LowerMemIntrinsics] Use i8 GEPs in memcpy/memmove lowering (#112707)

The IR lowering of memcpy/memmove intrinsics uses a target-specific type
for its load/store operations. So far, the loaded and stored addresses
are computed with GEPs based on this type. That is wrong if the
allocation size of the type differs from its store size: The width of
the accesses is determined by the store size, while the GEP stride is
determined by the allocation size. If the allocation size is greater
than the store size, some bytes are not copied/moved.

This patch changes the GEPs to use i8 addressing, with offsets based on
the type's store size. The correctness of the lowering therefore no
longer depends on the type's allocation size.

This is in support of PR #112332, which allows adjusting the memcpy loop
lowering type through a command line argument in the AMDGPU backend.

Added: 
    

Modified: 
    llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
    llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
    llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
    llvm/test/CodeGen/AMDGPU/memmove-var-size.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index ba62d75250c85e..546217093550a2 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -49,11 +49,12 @@ void llvm::createMemCpyLoopKnownSize(
   assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
          "Atomic memcpy lowering is not supported for vector operand type");
 
+  Type *Int8Type = Type::getInt8Ty(Ctx);
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
   assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
-      "Atomic memcpy lowering is not supported for selected operand size");
+         "Atomic memcpy lowering is not supported for selected operand size");
 
-  uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+  uint64_t LoopEndCount = alignDown(CopyLen->getZExtValue(), LoopOpSize);
 
   if (LoopEndCount != 0) {
     // Split
@@ -71,8 +72,12 @@ void llvm::createMemCpyLoopKnownSize(
     PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
     LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
     // Loop Body
-    Value *SrcGEP =
-        LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+
+    // If we used LoopOpType as GEP element type, we would iterate over the
+    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
+    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
+    // byte offsets computed from the TypeStoreSize.
+    Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex);
     LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
                                                    PartSrcAlign, SrcIsVolatile);
     if (!CanOverlap) {
@@ -80,8 +85,7 @@ void llvm::createMemCpyLoopKnownSize(
       Load->setMetadata(LLVMContext::MD_alias_scope,
                         MDNode::get(Ctx, NewScope));
     }
-    Value *DstGEP =
-        LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+    Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex);
     StoreInst *Store = LoopBuilder.CreateAlignedStore(
         Load, DstGEP, PartDstAlign, DstIsVolatile);
     if (!CanOverlap) {
@@ -92,8 +96,8 @@ void llvm::createMemCpyLoopKnownSize(
       Load->setAtomic(AtomicOrdering::Unordered);
       Store->setAtomic(AtomicOrdering::Unordered);
     }
-    Value *NewIndex =
-        LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
+    Value *NewIndex = LoopBuilder.CreateAdd(
+        LoopIndex, ConstantInt::get(TypeOfCopyLen, LoopOpSize));
     LoopIndex->addIncoming(NewIndex, LoopBB);
 
     // Create the loop branch condition.
@@ -102,7 +106,7 @@ void llvm::createMemCpyLoopKnownSize(
                              LoopBB, PostLoopBB);
   }
 
-  uint64_t BytesCopied = LoopEndCount * LoopOpSize;
+  uint64_t BytesCopied = LoopEndCount;
   uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
   if (RemainingBytes) {
     IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
@@ -117,18 +121,13 @@ void llvm::createMemCpyLoopKnownSize(
       Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
       Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
 
-      // Calculate the new index
       unsigned OperandSize = DL.getTypeStoreSize(OpTy);
       assert(
           (!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
           "Atomic memcpy lowering is not supported for selected operand size");
 
-      uint64_t GepIndex = BytesCopied / OperandSize;
-      assert(GepIndex * OperandSize == BytesCopied &&
-             "Division should have no Remainder!");
-
       Value *SrcGEP = RBuilder.CreateInBoundsGEP(
-          OpTy, SrcAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+          Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
       LoadInst *Load =
           RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
       if (!CanOverlap) {
@@ -137,7 +136,7 @@ void llvm::createMemCpyLoopKnownSize(
                           MDNode::get(Ctx, NewScope));
       }
       Value *DstGEP = RBuilder.CreateInBoundsGEP(
-          OpTy, DstAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+          Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
       StoreInst *Store = RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
                                                      DstIsVolatile);
       if (!CanOverlap) {
@@ -155,16 +154,6 @@ void llvm::createMemCpyLoopKnownSize(
          "Bytes copied should match size in the call!");
 }
 
-// \returns \p Len udiv \p OpSize, checking for optimization opportunities.
-static Value *getRuntimeLoopCount(const DataLayout &DL, IRBuilderBase &B,
-                                  Value *Len, Value *OpSize,
-                                  unsigned OpSizeVal) {
-  // For powers of 2, we can lshr by log2 instead of using udiv.
-  if (isPowerOf2_32(OpSizeVal))
-    return B.CreateLShr(Len, Log2_32(OpSizeVal));
-  return B.CreateUDiv(Len, OpSize);
-}
-
 // \returns \p Len urem \p OpSize, checking for optimization opportunities.
 static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B,
                                       Value *Len, Value *OpSize,
@@ -175,6 +164,18 @@ static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B,
   return B.CreateURem(Len, OpSize);
 }
 
+// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
+// opportunities.
+// If RTLoopRemainder is provided, it must be the result of
+// getRuntimeLoopRemainder() with the same arguments.
+static Value *getRuntimeLoopBytes(const DataLayout &DL, IRBuilderBase &B,
+                                  Value *Len, Value *OpSize, unsigned OpSizeVal,
+                                  Value *RTLoopRemainder = nullptr) {
+  if (!RTLoopRemainder)
+    RTLoopRemainder = getRuntimeLoopRemainder(DL, B, Len, OpSize, OpSizeVal);
+  return B.CreateSub(Len, RTLoopRemainder);
+}
+
 void llvm::createMemCpyLoopUnknownSize(
     Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
     Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
@@ -213,10 +214,15 @@ void llvm::createMemCpyLoopUnknownSize(
   Type *Int8Type = Type::getInt8Ty(Ctx);
   bool LoopOpIsInt8 = LoopOpType == Int8Type;
   ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
-  Value *RuntimeLoopCount = LoopOpIsInt8
-                                ? CopyLen
-                                : getRuntimeLoopCount(DL, PLBuilder, CopyLen,
-                                                      CILoopOpSize, LoopOpSize);
+
+  Value *RuntimeLoopBytes = CopyLen;
+  Value *RuntimeResidualBytes = nullptr;
+  if (!LoopOpIsInt8) {
+    RuntimeResidualBytes = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
+                                                   CILoopOpSize, LoopOpSize);
+    RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize,
+                                           LoopOpSize, RuntimeResidualBytes);
+  }
 
   BasicBlock *LoopBB =
       BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
@@ -228,14 +234,18 @@ void llvm::createMemCpyLoopUnknownSize(
   PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
   LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
 
-  Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+  // If we used LoopOpType as GEP element type, we would iterate over the
+  // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
+  // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use byte
+  // offsets computed from the TypeStoreSize.
+  Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex);
   LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
                                                  PartSrcAlign, SrcIsVolatile);
   if (!CanOverlap) {
     // Set alias scope for loads.
     Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope));
   }
-  Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+  Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex);
   StoreInst *Store =
       LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
   if (!CanOverlap) {
@@ -246,13 +256,13 @@ void llvm::createMemCpyLoopUnknownSize(
     Load->setAtomic(AtomicOrdering::Unordered);
     Store->setAtomic(AtomicOrdering::Unordered);
   }
-  Value *NewIndex =
-      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
+  Value *NewIndex = LoopBuilder.CreateAdd(
+      LoopIndex, ConstantInt::get(CopyLenType, LoopOpSize));
   LoopIndex->addIncoming(NewIndex, LoopBB);
 
-  bool requiresResidual =
+  bool RequiresResidual =
       !LoopOpIsInt8 && !(AtomicElementSize && LoopOpSize == AtomicElementSize);
-  if (requiresResidual) {
+  if (RequiresResidual) {
     Type *ResLoopOpType = AtomicElementSize
                               ? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
                               : Int8Type;
@@ -263,14 +273,9 @@ void llvm::createMemCpyLoopUnknownSize(
     Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize));
     Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize));
 
-    Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
-                                                     CILoopOpSize, LoopOpSize);
-    Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
-
     // Loop body for the residual copy.
-    BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
-                                               PreLoopBB->getParent(),
-                                               PostLoopBB);
+    BasicBlock *ResLoopBB = BasicBlock::Create(
+        Ctx, "loop-memcpy-residual", PreLoopBB->getParent(), PostLoopBB);
     // Residual loop header.
     BasicBlock *ResHeaderBB = BasicBlock::Create(
         Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
@@ -281,17 +286,17 @@ void llvm::createMemCpyLoopUnknownSize(
     // non-zero and finally branch to after the residual loop if the memcpy
     //  size is zero.
     ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
-    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero),
                            LoopBB, ResHeaderBB);
     PreLoopBB->getTerminator()->eraseFromParent();
 
     LoopBuilder.CreateCondBr(
-        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB,
         ResHeaderBB);
 
     // Determine if we need to branch to the residual loop or bypass it.
     IRBuilder<> RHBuilder(ResHeaderBB);
-    RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
+    RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidualBytes, Zero),
                            ResLoopBB, PostLoopBB);
 
     // Copy the residual with single byte load/store loop.
@@ -300,9 +305,8 @@ void llvm::createMemCpyLoopUnknownSize(
         ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
     ResidualIndex->addIncoming(Zero, ResHeaderBB);
 
-    Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
-    Value *SrcGEP =
-        ResBuilder.CreateInBoundsGEP(ResLoopOpType, SrcAddr, FullOffset);
+    Value *FullOffset = ResBuilder.CreateAdd(RuntimeLoopBytes, ResidualIndex);
+    Value *SrcGEP = ResBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, FullOffset);
     LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
                                                   ResSrcAlign, SrcIsVolatile);
     if (!CanOverlap) {
@@ -310,8 +314,7 @@ void llvm::createMemCpyLoopUnknownSize(
       Load->setMetadata(LLVMContext::MD_alias_scope,
                         MDNode::get(Ctx, NewScope));
     }
-    Value *DstGEP =
-        ResBuilder.CreateInBoundsGEP(ResLoopOpType, DstAddr, FullOffset);
+    Value *DstGEP = ResBuilder.CreateInBoundsGEP(Int8Type, DstAddr, FullOffset);
     StoreInst *Store =
         ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
     if (!CanOverlap) {
@@ -328,7 +331,7 @@ void llvm::createMemCpyLoopUnknownSize(
 
     // Create the loop branch condition.
     ResBuilder.CreateCondBr(
-        ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
+        ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidualBytes), ResLoopBB,
         PostLoopBB);
   } else {
     // In this case the loop operand type was a byte, and there is no need for a
@@ -336,11 +339,11 @@ void llvm::createMemCpyLoopUnknownSize(
     // We do however need to patch up the control flow by creating the
     // terminators for the preloop block and the memcpy loop.
     ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
-    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero),
                            LoopBB, PostLoopBB);
     PreLoopBB->getTerminator()->eraseFromParent();
     LoopBuilder.CreateCondBr(
-        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB,
         PostLoopBB);
   }
 }
@@ -425,27 +428,25 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
   // Calculate the loop trip count and remaining bytes to copy after the loop.
   IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
   ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+  ConstantInt *CIResidualLoopOpSize =
+      ConstantInt::get(ILengthType, ResidualLoopOpSize);
   ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
-  ConstantInt *One = ConstantInt::get(ILengthType, 1);
 
   IRBuilder<> PLBuilder(InsertBefore);
 
-  Value *RuntimeLoopCount = CopyLen;
+  Value *RuntimeLoopBytes = CopyLen;
   Value *RuntimeLoopRemainder = nullptr;
-  Value *RuntimeBytesCopiedMainLoop = CopyLen;
   Value *SkipResidualCondition = nullptr;
   if (RequiresResidual) {
-    RuntimeLoopCount =
-        getRuntimeLoopCount(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
     RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
                                                    CILoopOpSize, LoopOpSize);
-    RuntimeBytesCopiedMainLoop =
-        PLBuilder.CreateSub(CopyLen, RuntimeLoopRemainder);
+    RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize,
+                                           LoopOpSize, RuntimeLoopRemainder);
     SkipResidualCondition =
         PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
   }
   Value *SkipMainCondition =
-      PLBuilder.CreateICmpEQ(RuntimeLoopCount, Zero, "skip_main");
+      PLBuilder.CreateICmpEQ(RuntimeLoopBytes, Zero, "skip_main");
 
   // Create the a comparison of src and dst, based on which we jump to either
   // the forward-copy part of the function (if src >= dst) or the backwards-copy
@@ -464,7 +465,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
   SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                 &ThenTerm, &ElseTerm);
 
-  // If the LoopOpSize is greater than 1, each part of the function consist of
+  // If the LoopOpSize is greater than 1, each part of the function consists of
   // four blocks:
   //   memmove_copy_backwards:
   //       skip the residual loop when 0 iterations are required
@@ -519,14 +520,18 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
       IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
       PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
       Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
-          ResidualLoopPhi, One, "bwd_residual_index");
-      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
-          ResidualLoopOpType, SrcAddr, ResidualIndex);
+          ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
+      // If we used LoopOpType as GEP element type, we would iterate over the
+      // buffers in TypeStoreSize strides while copying TypeAllocSize bytes,
+      // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
+      // use byte offsets computed from the TypeStoreSize.
+      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
+                                                             ResidualIndex);
       Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
           ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
           "element");
-      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
-          ResidualLoopOpType, DstAddr, ResidualIndex);
+      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
+                                                              ResidualIndex);
       ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                              ResidualDstAlign, DstIsVolatile);
 
@@ -537,8 +542,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
       IRBuilder<> IntermediateBuilder(IntermediateBB);
       IntermediateBuilder.CreateUnreachable();
       ResidualLoopBuilder.CreateCondBr(
-          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex,
-                                           RuntimeBytesCopiedMainLoop),
+          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
           IntermediateBB, ResidualLoopBB);
 
       ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
@@ -556,19 +560,19 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
     IRBuilder<> MainLoopBuilder(MainLoopBB);
     PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
     Value *MainIndex =
-        MainLoopBuilder.CreateSub(MainLoopPhi, One, "bwd_main_index");
+        MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
     Value *LoadGEP =
-        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainIndex);
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainIndex);
     Value *Element = MainLoopBuilder.CreateAlignedLoad(
         LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
     Value *StoreGEP =
-        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainIndex);
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainIndex);
     MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                        DstIsVolatile);
     MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
                                  ExitBB, MainLoopBB);
     MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
-    MainLoopPhi->addIncoming(RuntimeLoopCount, PredBB);
+    MainLoopPhi->addIncoming(RuntimeLoopBytes, PredBB);
 
     // How to get to the main loop:
     Instruction *PredBBTerm = PredBB->getTerminator();
@@ -586,14 +590,14 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
     PHINode *MainLoopPhi =
         MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
     Value *LoadGEP =
-        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainLoopPhi);
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainLoopPhi);
     Value *Element = MainLoopBuilder.CreateAlignedLoad(
         LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
     Value *StoreGEP =
-        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainLoopPhi);
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainLoopPhi);
     MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                        DstIsVolatile);
-    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, One);
+    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, CILoopOpSize);
     MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
     MainLoopPhi->addIncoming(Zero, CopyForwardBB);
 
@@ -605,7 +609,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
 
     // leaving or staying in the main loop
     MainLoopBuilder.CreateCondBr(
-        MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopCount), SuccessorBB,
+        MainLoopBuilder.CreateICmpEQ(MainIndex, RuntimeLoopBytes), SuccessorBB,
         MainLoopBB);
 
     // getting in or skipping the main loop
@@ -625,22 +629,22 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
       IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
       PHINode *ResidualLoopPhi =
           ResidualLoopBuilder.CreatePHI(ILengthType, 0, "fwd_residual_index");
-      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
-          ResidualLoopOpType, SrcAddr, ResidualLoopPhi);
+      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
+                                                             ResidualLoopPhi);
       Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
           ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
           "element");
-      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
-          ResidualLoopOpType, DstAddr, ResidualLoopPhi);
+      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
+                                                              ResidualLoopPhi);
       ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                              ResidualDstAlign, DstIsVolatile);
       Value *ResidualIndex =
-          ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, One);
+          ResidualLoopBuilder.CreateAdd(ResidualLoopPhi, CIResidualLoopOpSize);
       ResidualLoopBuilder.CreateCondBr(
           ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, CopyLen), ExitBB,
           ResidualLoopBB);
       ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
-      ResidualLoopPhi->addIncoming(RuntimeBytesCopiedMainLoop, IntermediateBB);
+      ResidualLoopPhi->addIncoming(RuntimeLoopBytes, IntermediateBB);
     }
   }
 }
@@ -669,16 +673,16 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
   Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
                                                    SrcAlign, DstAlign);
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+  Type *Int8Type = Type::getInt8Ty(Ctx);
 
   // Calculate the loop trip count and remaining bytes to copy after the loop.
-  uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
-  uint64_t BytesCopiedInLoop = LoopEndCount * LoopOpSize;
+  uint64_t BytesCopiedInLoop = alignDown(CopyLen->getZExtValue(), LoopOpSize);
   uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopiedInLoop;
 
   IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
   ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
-  ConstantInt *One = ConstantInt::get(ILengthType, 1);
-  ConstantInt *TripCount = ConstantInt::get(ILengthType, LoopEndCount);
+  ConstantInt *LoopBound = ConstantInt::get(ILengthType, BytesCopiedInLoop);
+  ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
 
   IRBuilder<> PLBuilder(InsertBefore);
 
@@ -705,19 +709,18 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
     Align ResSrcAlign(commonAlignment(SrcAlign, BytesCopied));
     Align ResDstAlign(commonAlignment(DstAlign, BytesCopied));
 
-    // Calculate the new index
     unsigned OperandSize = DL.getTypeStoreSize(OpTy);
 
-    uint64_t GepIndex = BytesCopied / OperandSize;
-    assert(GepIndex * OperandSize == BytesCopied &&
-           "Division should have no Remainder!");
-
+    // If we used LoopOpType as GEP element type, we would iterate over the
+    // buffers in TypeStoreSize strides while copying TypeAllocSize bytes, i.e.,
+    // we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore, use
+    // byte offsets computed from the TypeStoreSize.
     Value *SrcGEP = Builder.CreateInBoundsGEP(
-        OpTy, SrcAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+        Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
     LoadInst *Load =
         Builder.CreateAlignedLoad(OpTy, SrcGEP, ResSrcAlign, SrcIsVolatile);
     Value *DstGEP = Builder.CreateInBoundsGEP(
-        OpTy, DstAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+        Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
     Builder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
     BytesCopied += OperandSize;
   };
@@ -743,7 +746,7 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
       GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied);
     }
   }
-  if (LoopEndCount != 0) {
+  if (BytesCopiedInLoop != 0) {
     BasicBlock *LoopBB = CopyBackwardsBB;
     BasicBlock *PredBB = OrigBB;
     if (RemainingBytes != 0) {
@@ -756,11 +759,11 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
     }
     IRBuilder<> LoopBuilder(LoopBB->getTerminator());
     PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0);
-    Value *Index = LoopBuilder.CreateSub(LoopPhi, One, "bwd_index");
-    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, Index);
+    Value *Index = LoopBuilder.CreateSub(LoopPhi, CILoopOpSize, "bwd_index");
+    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, Index);
     Value *Element = LoopBuilder.CreateAlignedLoad(
         LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
-    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, Index);
+    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, Index);
     LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                    DstIsVolatile);
 
@@ -772,12 +775,12 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
     UncondTerm->eraseFromParent();
 
     LoopPhi->addIncoming(Index, LoopBB);
-    LoopPhi->addIncoming(TripCount, PredBB);
+    LoopPhi->addIncoming(LoopBound, PredBB);
   }
 
   // Copying forward.
   BasicBlock *FwdResidualBB = CopyForwardBB;
-  if (LoopEndCount != 0) {
+  if (BytesCopiedInLoop != 0) {
     CopyForwardBB->setName("memmove_fwd_loop");
     BasicBlock *LoopBB = CopyForwardBB;
     BasicBlock *SuccBB = ExitBB;
@@ -789,21 +792,19 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
     }
     IRBuilder<> LoopBuilder(LoopBB->getTerminator());
     PHINode *LoopPhi = LoopBuilder.CreatePHI(ILengthType, 0, "fwd_index");
-    Value *LoadGEP =
-        LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopPhi);
+    Value *LoadGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopPhi);
     Value *Element = LoopBuilder.CreateAlignedLoad(
         LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
-    Value *StoreGEP =
-        LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopPhi);
+    Value *StoreGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopPhi);
     LoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                    DstIsVolatile);
-    Value *Index = LoopBuilder.CreateAdd(LoopPhi, One);
+    Value *Index = LoopBuilder.CreateAdd(LoopPhi, CILoopOpSize);
     LoopPhi->addIncoming(Index, LoopBB);
     LoopPhi->addIncoming(Zero, OrigBB);
 
     // Replace the unconditional branch to turn LoopBB into a loop.
     Instruction *UncondTerm = LoopBB->getTerminator();
-    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, TripCount), SuccBB,
+    LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpEQ(Index, LoopBound), SuccBB,
                              LoopBB);
     UncondTerm->eraseFromParent();
   }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index 6b054556135156..7f23434c9dfdd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -7,107 +7,101 @@ declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1
 define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
 ; LOOP-LABEL: memcpy_p1i8:
 ; LOOP:       ; %bb.0:
-; LOOP-NEXT:    s_mov_b32 s6, 0
-; LOOP-NEXT:    s_mov_b32 s7, 0xf000
-; LOOP-NEXT:    s_mov_b64 s[4:5], 0
-; LOOP-NEXT:    v_mov_b32_e32 v5, v3
-; LOOP-NEXT:    v_mov_b32_e32 v4, v2
-; LOOP-NEXT:    v_mov_b32_e32 v7, v1
-; LOOP-NEXT:    v_mov_b32_e32 v6, v0
-; LOOP-NEXT:    v_mov_b32_e32 v8, s6
+; LOOP-NEXT:    s_mov_b64 s[0:1], 0
+; LOOP-NEXT:    s_mov_b32 s2, 0
+; LOOP-NEXT:    s_mov_b32 s3, 0xf000
+; LOOP-NEXT:    v_mov_b32_e32 v5, s1
+; LOOP-NEXT:    v_mov_b32_e32 v4, s0
 ; LOOP-NEXT:  .LBB0_1: ; %load-store-loop
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT:    buffer_load_ubyte v9, v[4:5], s[4:7], 0 addr64
-; LOOP-NEXT:    s_waitcnt expcnt(6)
-; LOOP-NEXT:    buffer_load_ubyte v10, v[4:5], s[4:7], 0 addr64 offset:1
-; LOOP-NEXT:    s_waitcnt expcnt(3)
-; LOOP-NEXT:    buffer_load_ubyte v11, v[4:5], s[4:7], 0 addr64 offset:2
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    buffer_load_ubyte v12, v[4:5], s[4:7], 0 addr64 offset:3
-; LOOP-NEXT:    buffer_load_ubyte v13, v[4:5], s[4:7], 0 addr64 offset:4
-; LOOP-NEXT:    buffer_load_ubyte v14, v[4:5], s[4:7], 0 addr64 offset:5
-; LOOP-NEXT:    buffer_load_ubyte v15, v[4:5], s[4:7], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_load_ubyte v16, v[4:5], s[4:7], 0 addr64 offset:7
-; LOOP-NEXT:    buffer_load_ubyte v17, v[4:5], s[4:7], 0 addr64 offset:8
-; LOOP-NEXT:    buffer_load_ubyte v18, v[4:5], s[4:7], 0 addr64 offset:9
-; LOOP-NEXT:    buffer_load_ubyte v19, v[4:5], s[4:7], 0 addr64 offset:10
-; LOOP-NEXT:    buffer_load_ubyte v20, v[4:5], s[4:7], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_load_ubyte v21, v[4:5], s[4:7], 0 addr64 offset:12
-; LOOP-NEXT:    buffer_load_ubyte v22, v[4:5], s[4:7], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_load_ubyte v23, v[4:5], s[4:7], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_load_ubyte v24, v[4:5], s[4:7], 0 addr64 offset:15
-; LOOP-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
-; LOOP-NEXT:    s_xor_b64 s[0:1], vcc, -1
-; LOOP-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
-; LOOP-NEXT:    s_and_b64 vcc, s[0:1], exec
+; LOOP-NEXT:    v_add_i32_e32 v6, vcc, v2, v4
+; LOOP-NEXT:    v_addc_u32_e32 v7, vcc, v3, v5, vcc
+; LOOP-NEXT:    v_add_i32_e32 v8, vcc, v0, v4
+; LOOP-NEXT:    v_addc_u32_e32 v9, vcc, v1, v5, vcc
+; LOOP-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
+; LOOP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; LOOP-NEXT:    buffer_load_ubyte v10, v[6:7], s[0:3], 0 addr64
+; LOOP-NEXT:    buffer_load_ubyte v11, v[6:7], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_load_ubyte v12, v[6:7], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_load_ubyte v13, v[6:7], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_load_ubyte v14, v[6:7], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT:    buffer_load_ubyte v15, v[6:7], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_load_ubyte v16, v[6:7], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_load_ubyte v17, v[6:7], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_load_ubyte v18, v[6:7], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT:    buffer_load_ubyte v19, v[6:7], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT:    buffer_load_ubyte v20, v[6:7], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT:    buffer_load_ubyte v21, v[6:7], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT:    buffer_load_ubyte v22, v[6:7], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT:    buffer_load_ubyte v23, v[6:7], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_load_ubyte v24, v[6:7], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT:    buffer_load_ubyte v6, v[6:7], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v4
 ; LOOP-NEXT:    s_waitcnt vmcnt(14)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; LOOP-NEXT:    v_lshlrev_b32_e32 v7, 8, v11
 ; LOOP-NEXT:    s_waitcnt vmcnt(12)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
-; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; LOOP-NEXT:    v_lshlrev_b32_e32 v11, 24, v13
+; LOOP-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
 ; LOOP-NEXT:    s_waitcnt vmcnt(10)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; LOOP-NEXT:    v_lshlrev_b32_e32 v13, 8, v15
 ; LOOP-NEXT:    s_waitcnt vmcnt(8)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
-; LOOP-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; LOOP-NEXT:    v_lshlrev_b32_e32 v15, 24, v17
+; LOOP-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
 ; LOOP-NEXT:    s_waitcnt vmcnt(6)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
+; LOOP-NEXT:    v_lshlrev_b32_e32 v17, 8, v19
 ; LOOP-NEXT:    s_waitcnt vmcnt(4)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v20, 24, v20
-; LOOP-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; LOOP-NEXT:    v_lshlrev_b32_e32 v19, 24, v21
+; LOOP-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; LOOP-NEXT:    s_waitcnt vmcnt(2)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
+; LOOP-NEXT:    v_lshlrev_b32_e32 v21, 8, v23
 ; LOOP-NEXT:    s_waitcnt vmcnt(0)
-; LOOP-NEXT:    v_lshlrev_b32_e32 v24, 24, v24
-; LOOP-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; LOOP-NEXT:    v_or_b32_e32 v9, v10, v9
+; LOOP-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; LOOP-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; LOOP-NEXT:    v_or_b32_e32 v7, v7, v10
+; LOOP-NEXT:    v_or_b32_e32 v10, v11, v12
+; LOOP-NEXT:    v_or_b32_e32 v11, v13, v14
+; LOOP-NEXT:    v_or_b32_e32 v12, v15, v16
+; LOOP-NEXT:    v_or_b32_e32 v13, v17, v18
+; LOOP-NEXT:    v_or_b32_e32 v14, v19, v20
+; LOOP-NEXT:    v_or_b32_e32 v15, v21, v22
+; LOOP-NEXT:    v_or_b32_e32 v6, v6, v23
+; LOOP-NEXT:    v_or_b32_e32 v7, v10, v7
 ; LOOP-NEXT:    v_or_b32_e32 v10, v12, v11
 ; LOOP-NEXT:    v_or_b32_e32 v11, v14, v13
-; LOOP-NEXT:    v_or_b32_e32 v12, v16, v15
-; LOOP-NEXT:    v_or_b32_e32 v13, v18, v17
-; LOOP-NEXT:    v_or_b32_e32 v14, v20, v19
-; LOOP-NEXT:    v_or_b32_e32 v15, v22, v21
-; LOOP-NEXT:    v_or_b32_e32 v16, v24, v23
-; LOOP-NEXT:    v_or_b32_e32 v9, v10, v9
-; LOOP-NEXT:    v_or_b32_e32 v10, v12, v11
-; LOOP-NEXT:    v_or_b32_e32 v11, v14, v13
-; LOOP-NEXT:    v_or_b32_e32 v12, v16, v15
-; LOOP-NEXT:    v_lshrrev_b32_e32 v13, 16, v9
-; LOOP-NEXT:    v_bfe_u32 v14, v9, 8, 8
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[4:7], 0 addr64
+; LOOP-NEXT:    v_or_b32_e32 v6, v6, v15
+; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
+; LOOP-NEXT:    v_bfe_u32 v13, v7, 8, 8
+; LOOP-NEXT:    buffer_store_byte v7, v[8:9], s[0:3], 0 addr64
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v9, 24, v9
-; LOOP-NEXT:    v_lshrrev_b32_e32 v15, 16, v10
-; LOOP-NEXT:    v_bfe_u32 v16, v10, 8, 8
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:4
+; LOOP-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
+; LOOP-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; LOOP-NEXT:    v_bfe_u32 v15, v10, 8, 8
+; LOOP-NEXT:    buffer_store_byte v10, v[8:9], s[0:3], 0 addr64 offset:4
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v10, 24, v10
-; LOOP-NEXT:    v_lshrrev_b32_e32 v17, 16, v11
-; LOOP-NEXT:    v_bfe_u32 v18, v11, 8, 8
-; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:8
+; LOOP-NEXT:    v_lshrrev_b32_e32 v16, 16, v11
+; LOOP-NEXT:    v_bfe_u32 v17, v11, 8, 8
+; LOOP-NEXT:    buffer_store_byte v11, v[8:9], s[0:3], 0 addr64 offset:8
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
-; LOOP-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
-; LOOP-NEXT:    v_bfe_u32 v20, v12, 8, 8
-; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:12
+; LOOP-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
+; LOOP-NEXT:    v_bfe_u32 v19, v6, 8, 8
+; LOOP-NEXT:    buffer_store_byte v6, v[8:9], s[0:3], 0 addr64 offset:12
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 24, v12
-; LOOP-NEXT:    buffer_store_byte v14, v[6:7], s[4:7], 0 addr64 offset:1
-; LOOP-NEXT:    buffer_store_byte v13, v[6:7], s[4:7], 0 addr64 offset:2
-; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[4:7], 0 addr64 offset:3
-; LOOP-NEXT:    buffer_store_byte v16, v[6:7], s[4:7], 0 addr64 offset:5
-; LOOP-NEXT:    buffer_store_byte v15, v[6:7], s[4:7], 0 addr64 offset:6
-; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:7
-; LOOP-NEXT:    buffer_store_byte v18, v[6:7], s[4:7], 0 addr64 offset:9
-; LOOP-NEXT:    buffer_store_byte v17, v[6:7], s[4:7], 0 addr64 offset:10
-; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_store_byte v20, v[6:7], s[4:7], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:15
-; LOOP-NEXT:    v_add_i32_e64 v6, s[0:1], 16, v6
-; LOOP-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, v7, s[0:1]
-; LOOP-NEXT:    v_add_i32_e64 v4, s[0:1], 16, v4
-; LOOP-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, v5, s[0:1]
+; LOOP-NEXT:    v_lshrrev_b32_e32 v6, 24, v6
+; LOOP-NEXT:    buffer_store_byte v13, v[8:9], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT:    buffer_store_byte v12, v[8:9], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT:    buffer_store_byte v7, v[8:9], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT:    buffer_store_byte v15, v[8:9], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT:    buffer_store_byte v14, v[8:9], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT:    buffer_store_byte v10, v[8:9], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT:    buffer_store_byte v17, v[8:9], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT:    buffer_store_byte v16, v[8:9], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT:    buffer_store_byte v11, v[8:9], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT:    buffer_store_byte v19, v[8:9], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_store_byte v18, v[8:9], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT:    buffer_store_byte v6, v[8:9], s[0:3], 0 addr64 offset:15
 ; LOOP-NEXT:    s_cbranch_vccnz .LBB0_1
 ; LOOP-NEXT:  ; %bb.2: ; %memcpy-split
 ; LOOP-NEXT:    s_mov_b32 s2, 0
@@ -115,6 +109,7 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    s_mov_b64 s[0:1], 0
 ; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:17
 ; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:18
 ; LOOP-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:16
 ; LOOP-NEXT:    s_waitcnt vmcnt(3)

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 3f07b3d94692c2..a95f22507eece3 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -45,12 +45,12 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1)
 ; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void
@@ -65,12 +65,12 @@ define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1)
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
@@ -92,22 +92,22 @@ define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
-; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 16
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 64
+; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1024
 ; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -127,22 +127,22 @@ define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1
 ; OPT-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
 ; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
 ; OPT:       memmove_bwd_loop:
-; OPT-NEXT:    [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
-; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
 ; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
 ; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; OPT:       memmove_fwd_loop:
 ; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP8]], align 1
-; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
-; OPT-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 1
-; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 64
+; OPT-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 16
+; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 1024
 ; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
 ; OPT:       memmove_fwd_residual:
 ; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
@@ -197,19 +197,18 @@ define amdgpu_kernel void @min_size_large_static_memset_caller0(ptr addrspace(1)
 
 define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @variable_memcpy_caller0(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -233,19 +232,18 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad
 
 define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @variable_memcpy_caller1(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -269,19 +267,18 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad
 
 define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n, i64 %m) #0 {
 ; OPT-LABEL: @memcpy_multi_use_one_function(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
 ; OPT:       loop-memcpy-expansion2:
 ; OPT-NEXT:    [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX3]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX3]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX3]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX3]]
 ; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX3]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX3]], 16
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION2]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5]]
 ; OPT:       loop-memcpy-residual4:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL4:%.*]] ]
@@ -294,19 +291,18 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
 ; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
 ; OPT:       post-loop-memcpy-expansion1:
-; OPT-NEXT:    [[TMP16:%.*]] = lshr i64 [[M:%.*]], 4
-; OPT-NEXT:    [[TMP17:%.*]] = and i64 [[M]], 15
+; OPT-NEXT:    [[TMP17:%.*]] = and i64 [[M:%.*]], 15
 ; OPT-NEXT:    [[TMP18:%.*]] = sub i64 [[M]], [[TMP17]]
-; OPT-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP16]], 0
+; OPT-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP18]], 0
 ; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP23:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP20:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP20]], align 1
-; OPT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST1:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP21]], ptr addrspace(1) [[TMP22]], align 1
-; OPT-NEXT:    [[TMP23]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP24:%.*]] = icmp ult i64 [[TMP23]], [[TMP16]]
+; OPT-NEXT:    [[TMP23]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP24:%.*]] = icmp ult i64 [[TMP23]], [[TMP18]]
 ; OPT-NEXT:    br i1 [[TMP24]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -334,19 +330,18 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
 
 define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N:%.*]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -371,19 +366,18 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(
 ; One of the uses in the function should be expanded, the other left alone.
 define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n) #0 {
 ; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
-; MAX1024-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
-; MAX1024-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
+; MAX1024-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
 ; MAX1024-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
-; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; MAX1024:       loop-memcpy-expansion:
 ; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; MAX1024-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX]]
+; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX]]
 ; MAX1024-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; MAX1024-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 1
-; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; MAX1024-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
+; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; MAX1024-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; MAX1024:       loop-memcpy-residual:
 ; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -403,19 +397,18 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac
 ; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
 ;
 ; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
-; ALL-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
-; ALL-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
+; ALL-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
 ; ALL-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
-; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; ALL:       loop-memcpy-expansion:
 ; ALL-NEXT:    [[LOOP_INDEX1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX1]]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX1]]
 ; ALL-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX1]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX1]]
 ; ALL-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX1]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; ALL-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX1]], 16
+; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; ALL-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; ALL:       loop-memcpy-residual:
 ; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -431,21 +424,21 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac
 ; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION]] ], [ [[TMP19:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP16]], align 1
-; ALL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST1:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[TMP17]], ptr addrspace(1) [[TMP18]], align 1
-; ALL-NEXT:    [[TMP19]] = add i64 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP20:%.*]] = icmp ult i64 [[TMP19]], 6
+; ALL-NEXT:    [[TMP19]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    [[TMP20:%.*]] = icmp ult i64 [[TMP19]], 96
 ; ALL-NEXT:    br i1 [[TMP20]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
-; ALL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 24
+; ALL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 96
 ; ALL-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 1
-; ALL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST1]], i64 24
+; ALL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 96
 ; ALL-NEXT:    store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
-; ALL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 50
+; ALL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 100
 ; ALL-NEXT:    [[TMP25:%.*]] = load i16, ptr addrspace(1) [[TMP24]], align 1
-; ALL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST1]], i64 50
+; ALL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 100
 ; ALL-NEXT:    store i16 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
 ; ALL-NEXT:    ret void
 ; ALL:       loop-memcpy-residual-header:
@@ -462,17 +455,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 256
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 256
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i32 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
 ; OPT-NEXT:    ret void
 ;
@@ -485,12 +478,12 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
@@ -508,17 +501,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 512
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 512
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
 ; OPT-NEXT:    ret void
 ;
@@ -531,17 +524,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
 ; OPT-NEXT:    ret void
 ;
@@ -554,21 +547,21 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
-; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 516
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
 ; OPT-NEXT:    [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 516
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
 ; OPT-NEXT:    store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
 ; OPT-NEXT:    ret void
 ;
@@ -581,21 +574,21 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
-; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 516
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
 ; OPT-NEXT:    [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 516
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
 ; OPT-NEXT:    store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
 ; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1034
 ; OPT-NEXT:    [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 2
@@ -612,21 +605,21 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
-; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
 ; OPT-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 4
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
 ; OPT-NEXT:    store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
 ; OPT-NEXT:    ret void
 ;
@@ -639,25 +632,25 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
-; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
 ; OPT-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 4
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
 ; OPT-NEXT:    store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
-; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
 ; OPT-NEXT:    [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 4
-; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
 ; OPT-NEXT:    store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
 ; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
 ; OPT-NEXT:    [[TMP16:%.*]] = load i8, ptr addrspace(1) [[TMP15]], align 2
@@ -674,12 +667,12 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 519
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
@@ -697,17 +690,17 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 512
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 512
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
 ; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
 ; OPT-NEXT:    [[TMP10:%.*]] = load i8, ptr addrspace(1) [[TMP9]], align 2
@@ -724,12 +717,12 @@ define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 513
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
@@ -747,12 +740,12 @@ define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
-; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 513
+; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
@@ -770,17 +763,17 @@ define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspa
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[SRC]], i32 512
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[DST]], i32 512
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
 ; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 4
 ; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
 ; OPT-NEXT:    [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 2
@@ -797,12 +790,12 @@ define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspa
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 513
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 2
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
@@ -820,17 +813,17 @@ define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspa
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 4
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[SRC]], i32 512
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[DST]], i32 512
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
 ; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 1
 ; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
 ; OPT-NEXT:    [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 2
@@ -847,12 +840,12 @@ define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspa
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 513
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 2
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
@@ -870,17 +863,17 @@ define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspa
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 64
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[SRC]], i32 512
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
 ; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 1
-; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[DST]], i32 512
+; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
 ; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 4
 ; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
 ; OPT-NEXT:    [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 1
@@ -897,12 +890,12 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
 ; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; OPT:       load-store-loop:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
-; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 513
+; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 2
+; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
 ; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; OPT:       memcpy-split:
 ; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
@@ -917,19 +910,18 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
 
 define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align4_global_align4_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
-; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -953,19 +945,18 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
 
 define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 1
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 1
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 1
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
-; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 2
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -989,19 +980,18 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs
 
 define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align1_global_align1_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1025,19 +1015,18 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
 
 define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N:%.*]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1061,19 +1050,18 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 1
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 1
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N:%.*]], 1
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store i16 [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
-; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 2
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1097,19 +1085,18 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N:%.*]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
-; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1133,19 +1120,18 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N:%.*]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP5]], align 4
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
-; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1169,19 +1155,18 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
 
 define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N:%.*]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
-; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1212,12 +1197,12 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_16(ptr addrspace(1
 ; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void
@@ -1232,13 +1217,13 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_12(ptr addrspace(1
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memcpy_global_align4_global_align4_12(
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC:%.*]], i64 0
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
 ; ALL-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST:%.*]], i64 0
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
 ; ALL-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 2
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 8
 ; ALL-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(1) [[TMP4]], align 4
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 2
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 8
 ; ALL-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
 ; ALL-NEXT:    ret void
 ;
@@ -1252,9 +1237,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_8(ptr addrspace(1)
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memcpy_global_align4_global_align4_8(
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC:%.*]], i64 0
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
 ; ALL-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST:%.*]], i64 0
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
 ; ALL-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
 ; ALL-NEXT:    ret void
 ;
@@ -1268,13 +1253,13 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_10(ptr addrspace(1
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memcpy_global_align4_global_align4_10(
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC:%.*]], i64 0
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
 ; ALL-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST:%.*]], i64 0
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
 ; ALL-NEXT:    store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 4
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 8
 ; ALL-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 4
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 4
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 8
 ; ALL-NEXT:    store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
 ; ALL-NEXT:    ret void
 ;
@@ -1288,9 +1273,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_4(ptr addrspace(1)
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memcpy_global_align4_global_align4_4(
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC:%.*]], i64 0
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
 ; ALL-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST:%.*]], i64 0
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
 ; ALL-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
 ; ALL-NEXT:    ret void
 ;
@@ -1304,9 +1289,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_2(ptr addrspace(1)
 ; MAX1024-NEXT:    ret void
 ;
 ; ALL-LABEL: @memcpy_global_align4_global_align4_2(
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC:%.*]], i64 0
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
 ; ALL-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 4
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST:%.*]], i64 0
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
 ; ALL-NEXT:    store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
 ; ALL-NEXT:    ret void
 ;
@@ -1340,22 +1325,22 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 16
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -1374,22 +1359,22 @@ define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %d
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 16
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -1408,22 +1393,22 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 16
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -1442,22 +1427,22 @@ define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP2]], 16
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 16
+; ALL-NEXT:    [[TMP8]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -1475,12 +1460,12 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5)
 ; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]]
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]]
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void
@@ -1498,12 +1483,12 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
 ; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]]
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]]
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void
@@ -1514,11 +1499,10 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
 
 define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size) {
 ; OPT-LABEL: @memmove_global_align1_p999_align1(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[SIZE]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
 ; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
-; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0
 ; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr addrspace(999)
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(999) [[SRC:%.*]], [[TMP4]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
@@ -1536,11 +1520,11 @@ define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %d
 ; OPT:       memmove_bwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_bwd_main_loop:
-; OPT-NEXT:    [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
-; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[SRC]], i64 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP10]], align 1
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
@@ -1548,12 +1532,12 @@ define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %d
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_fwd_main_loop:
 ; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[SRC]], i64 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP13]], align 1
-; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT2]], ptr addrspace(1) [[TMP14]], align 1
-; OPT-NEXT:    [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
-; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT:       memmove_fwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
@@ -1575,11 +1559,10 @@ define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %d
 
 define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size) {
 ; OPT-LABEL: @memmove_p999_align1_p1_align1(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[SIZE]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
 ; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
-; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0
 ; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(1)
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP4]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
@@ -1597,11 +1580,11 @@ define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst
 ; OPT:       memmove_bwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_bwd_main_loop:
-; OPT-NEXT:    [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
-; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP10]], align 1
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
@@ -1609,12 +1592,12 @@ define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_fwd_main_loop:
 ; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP13]], align 1
-; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1
-; OPT-NEXT:    [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
-; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT:       memmove_fwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
@@ -1636,11 +1619,10 @@ define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst
 
 define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size) {
 ; OPT-LABEL: @memmove_p999_align1_p998_align1(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[SIZE]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
 ; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
-; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0
 ; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(998)
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(998) [[SRC:%.*]], [[TMP4]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
@@ -1658,11 +1640,11 @@ define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %d
 ; OPT:       memmove_bwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_bwd_main_loop:
-; OPT-NEXT:    [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
-; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(998) [[SRC]], i64 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP10]], align 1
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
@@ -1670,12 +1652,12 @@ define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %d
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_fwd_main_loop:
 ; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(998) [[SRC]], i64 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP13]], align 1
-; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1
-; OPT-NEXT:    [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
-; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT:       memmove_fwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
@@ -1704,12 +1686,12 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
 ; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; ALL-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; ALL-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
-; ALL-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
+; ALL-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void
@@ -1720,19 +1702,18 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
 
 define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
 ; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; MAX1024-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
-; MAX1024-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; MAX1024-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
 ; MAX1024-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
-; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; MAX1024:       loop-memcpy-expansion:
 ; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope [[META0:![0-9]+]]
-; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias [[META0]]
-; MAX1024-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
+; MAX1024-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT:    store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
+; MAX1024-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; MAX1024-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; MAX1024:       loop-memcpy-residual:
 ; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1751,19 +1732,18 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
 ; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
 ;
 ; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; ALL-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
-; ALL-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; ALL-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
 ; ALL-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
-; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; ALL:       loop-memcpy-expansion:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope [[META9:![0-9]+]]
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias [[META9]]
-; ALL-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
+; ALL-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; ALL-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; ALL:       loop-memcpy-residual:
 ; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1794,12 +1774,12 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
 ; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
 ; ALL-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
 ; ALL-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
-; ALL-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
+; ALL-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void
@@ -1810,19 +1790,18 @@ define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5)
 
 define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
 ; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; MAX1024-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
-; MAX1024-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; MAX1024-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
 ; MAX1024-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
-; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; MAX1024:       loop-memcpy-expansion:
 ; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope [[META3:![0-9]+]]
-; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias [[META3]]
-; MAX1024-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
+; MAX1024-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; MAX1024-NEXT:    store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
+; MAX1024-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; MAX1024-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; MAX1024:       loop-memcpy-residual:
 ; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1841,19 +1820,18 @@ define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr
 ; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
 ;
 ; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
-; ALL-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
-; ALL-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; ALL-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
 ; ALL-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
-; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
+; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; ALL:       loop-memcpy-expansion:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1, !alias.scope [[META15:![0-9]+]]
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(5) [[TMP7]], align 1, !noalias [[META15]]
-; ALL-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
+; ALL-NEXT:    store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
+; ALL-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
+; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
 ; ALL-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; ALL:       loop-memcpy-residual:
 ; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -1886,22 +1864,22 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_INDEX]]
 ; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_INDEX]]
 ; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i32 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT:    [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -1912,11 +1890,10 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
 
 define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
 ; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
-; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
 ; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3)
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP4]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
@@ -1934,11 +1911,11 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
 ; OPT:       memmove_bwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_bwd_main_loop:
-; OPT-NEXT:    [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
-; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
@@ -1946,12 +1923,12 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_fwd_main_loop:
 ; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
-; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
-; OPT-NEXT:    [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
-; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT:       memmove_fwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
@@ -1981,22 +1958,22 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP2]], 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP2]], 8
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
 ; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
 ; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
-; ALL-NEXT:    [[TMP8]] = add i32 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 32
+; ALL-NEXT:    [[TMP8]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
 ; ALL-NEXT:    br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -2007,11 +1984,10 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %ds
 
 define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
 ; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
-; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
 ; OPT-NEXT:    [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP4]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
@@ -2029,11 +2005,11 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
 ; OPT:       memmove_bwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_bwd_main_loop:
-; OPT-NEXT:    [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 1
-; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
-; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
 ; OPT-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
@@ -2041,12 +2017,12 @@ define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr add
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_fwd_main_loop:
 ; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
-; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
-; OPT-NEXT:    [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 1
-; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]]
+; OPT-NEXT:    [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT:       memmove_fwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
@@ -2075,22 +2051,22 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 1
-; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 8
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
 ; ALL-NEXT:    store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
 ; ALL-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 32
+; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 8
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
 ; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -2101,11 +2077,10 @@ define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %d
 
 define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
 ; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
-; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
 ; OPT:       memmove_copy_backwards:
@@ -2122,11 +2097,11 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
 ; OPT:       memmove_bwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_bwd_main_loop:
-; OPT-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 8
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
-; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
 ; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
@@ -2134,12 +2109,12 @@ define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr ad
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_fwd_main_loop:
 ; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
-; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
-; OPT-NEXT:    [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
-; OPT-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT:    [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 8
+; OPT-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT:       memmove_fwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
@@ -2168,22 +2143,22 @@ define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 1
-; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i32 [[TMP1]], 16
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP2]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]]
 ; ALL-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 16
+; ALL-NEXT:    [[TMP7]] = add i32 [[FWD_INDEX]], 16
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
 ; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -2194,11 +2169,10 @@ define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5
 
 define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size) {
 ; OPT-LABEL: @memmove_private_align1_private_align1_unknown_size(
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 4
-; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 15
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; OPT-NEXT:    [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
-; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
+; OPT-NEXT:    [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
 ; OPT:       memmove_copy_backwards:
@@ -2215,11 +2189,11 @@ define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(pt
 ; OPT:       memmove_bwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_bwd_main_loop:
-; OPT-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
-; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
+; OPT-NEXT:    [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 16
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP9]], align 1
-; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[BWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP10]], align 1
 ; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
@@ -2227,12 +2201,12 @@ define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(pt
 ; OPT-NEXT:    br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
 ; OPT:       memmove_fwd_main_loop:
 ; OPT-NEXT:    [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
-; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP12]], align 1
-; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST]], i32 [[FWD_MAIN_INDEX]]
+; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_MAIN_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT2]], ptr addrspace(5) [[TMP13]], align 1
-; OPT-NEXT:    [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 1
-; OPT-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP1]]
+; OPT-NEXT:    [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 16
+; OPT-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
 ; OPT:       memmove_fwd_middle:
 ; OPT-NEXT:    br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
@@ -2257,22 +2231,22 @@ define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrs
 ; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
 ; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; OPT:       memmove_bwd_loop:
-; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 65, [[TMP0:%.*]] ]
-; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
-; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1040, [[TMP0:%.*]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 16
+; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
 ; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
-; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; OPT:       memmove_fwd_loop:
 ; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
-; OPT-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
-; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 65
+; OPT-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 16
+; OPT-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1040
 ; OPT-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; OPT:       memmove_done:
 ; OPT-NEXT:    ret void
@@ -2290,49 +2264,49 @@ define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrsp
 ; OPT-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
 ; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
 ; OPT-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
-; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
 ; OPT-NEXT:    [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 1
-; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
 ; OPT-NEXT:    store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 1
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
 ; OPT-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 1
-; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
 ; OPT-NEXT:    store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 1
-; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP11:%.*]] = load i64, ptr addrspace(1) [[TMP10]], align 1
-; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i64 [[TMP11]], ptr addrspace(1) [[TMP12]], align 1
 ; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
 ; OPT:       memmove_bwd_loop:
-; OPT-NEXT:    [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[MEMMOVE_BWD_RESIDUAL]] ]
-; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP13]], 1
-; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ]
+; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP13]], 16
+; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
 ; OPT-NEXT:    [[ELEMENT:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP14]], align 1
-; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1
 ; OPT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; OPT-NEXT:    br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; OPT:       memmove_fwd_loop:
 ; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
-; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
 ; OPT-NEXT:    [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP17]], align 1
-; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; OPT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1
-; OPT-NEXT:    [[TMP19]] = add i64 [[FWD_INDEX]], 1
-; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 64
+; OPT-NEXT:    [[TMP19]] = add i64 [[FWD_INDEX]], 16
+; OPT-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 1024
 ; OPT-NEXT:    br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
 ; OPT:       memmove_fwd_residual:
-; OPT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[SRC]], i64 128
+; OPT-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
 ; OPT-NEXT:    [[TMP22:%.*]] = load i64, ptr addrspace(1) [[TMP21]], align 1
-; OPT-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[DST]], i64 128
+; OPT-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
 ; OPT-NEXT:    store i64 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
-; OPT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[SRC]], i64 258
+; OPT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
 ; OPT-NEXT:    [[TMP25:%.*]] = load i32, ptr addrspace(1) [[TMP24]], align 1
-; OPT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[DST]], i64 258
+; OPT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
 ; OPT-NEXT:    store i32 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
-; OPT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[SRC]], i64 518
+; OPT-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
 ; OPT-NEXT:    [[TMP28:%.*]] = load i16, ptr addrspace(1) [[TMP27]], align 1
-; OPT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[DST]], i64 518
+; OPT-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
 ; OPT-NEXT:    store i16 [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
 ; OPT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
 ; OPT-NEXT:    [[TMP31:%.*]] = load i8, ptr addrspace(1) [[TMP30]], align 1
@@ -2351,19 +2325,18 @@ define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) {
 ; OPT-NEXT:  entry:
 ; OPT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr [32 x [8 x i64]], ptr [[Y:%.*]], i64 0, i64 [[IDXPROM:%.*]]
 ; OPT-NEXT:    [[SPEC_SELECT:%.*]] = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56)
-; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[SPEC_SELECT]], 4
 ; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[SPEC_SELECT]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[SPEC_SELECT]], [[TMP2]]
-; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; OPT:       loop-memcpy-expansion:
 ; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr [[X:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1
-; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr [[ARRAYIDX]], i64 [[LOOP_INDEX]]
+; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX]], i64 [[LOOP_INDEX]]
 ; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP7]], align 1
-; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 1
-; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP1]]
+; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
 ; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
@@ -2397,22 +2370,22 @@ define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace
 ; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
 ; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
 ; ALL:       memmove_bwd_loop:
-; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 4, [[TMP0:%.*]] ]
-; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 1
-; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 64, [[TMP0:%.*]] ]
+; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 16
+; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
 ; ALL-NEXT:    store volatile <4 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
 ; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
 ; ALL:       memmove_fwd_loop:
 ; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    [[ELEMENT1:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
 ; ALL-NEXT:    store volatile <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
-; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 1
-; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 4
+; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 16
+; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 64
 ; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
 ; ALL:       memmove_done:
 ; ALL-NEXT:    ret void
@@ -2430,12 +2403,12 @@ define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace(
 ; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
-; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    [[TMP2:%.*]] = load volatile <4 x i32>, ptr addrspace(1) [[TMP1]], align 1
-; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
 ; ALL-NEXT:    store volatile <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
-; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
-; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 4
+; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 16
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 64
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
 ; ALL:       memcpy-split:
 ; ALL-NEXT:    ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 9d43efbdf07b1f..8157b1a7f7c802 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -9,146 +9,128 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshlrev_b64 v[4:5], 6, v[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[6:7], 6, v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], 0
-; CHECK-NEXT:  .LBB0_2: ; %loop-memcpy-expansion
+; CHECK-NEXT:  .LBB0_1: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_mov_b32_e32 v9, s7
-; CHECK-NEXT:    v_mov_b32_e32 v8, s6
-; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
-; CHECK-NEXT:    s_add_u32 s4, s4, 1
+; CHECK-NEXT:    v_mov_b32_e32 v7, s5
+; CHECK-NEXT:    v_mov_b32_e32 v6, s4
+; CHECK-NEXT:    flat_load_dwordx4 v[6:9], v[6:7]
+; CHECK-NEXT:    v_add_co_u32_e32 v10, vcc, s4, v4
+; CHECK-NEXT:    s_add_u32 s4, s4, 16
+; CHECK-NEXT:    v_mov_b32_e32 v11, s5
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_mov_b32_e32 v13, s7
-; CHECK-NEXT:    v_add_co_u32_e32 v12, vcc, s6, v6
-; CHECK-NEXT:    v_cmp_ge_u64_e64 s[8:9], s[4:5], 2
-; CHECK-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v7, vcc
-; CHECK-NEXT:    s_add_u32 s6, s6, 16
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
+; CHECK-NEXT:    v_cmp_ge_u64_e64 s[6:7], s[4:5], 32
+; CHECK-NEXT:    v_addc_co_u32_e32 v11, vcc, v5, v11, vcc
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_2
-; CHECK-NEXT:  ; %bb.3: ; %loop-memcpy-residual-header
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    s_mov_b32 s5, s4
-; CHECK-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; CHECK-NEXT:    s_cbranch_scc0 .LBB0_5
-; CHECK-NEXT:  ; %bb.4:
-; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; CHECK-NEXT:    s_branch .LBB0_6
-; CHECK-NEXT:  .LBB0_5: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge
-; CHECK-NEXT:    v_lshlrev_b64 v[2:3], 6, v[2:3]
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_9
-; CHECK-NEXT:  .LBB0_6: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v2, s4
-; CHECK-NEXT:    v_or_b32_e32 v6, 32, v4
-; CHECK-NEXT:    v_mov_b32_e32 v7, v5
-; CHECK-NEXT:    s_mov_b64 s[6:7], 0
-; CHECK-NEXT:    v_mov_b32_e32 v3, s5
-; CHECK-NEXT:  .LBB0_7: ; %loop-memcpy-residual
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_add_u32 s4, 32, s6
-; CHECK-NEXT:    s_addc_u32 s5, 0, s7
+; CHECK-NEXT:    flat_store_dwordx4 v[10:11], v[6:9]
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_1
+; CHECK-NEXT:  ; %bb.2: ; %loop-memcpy-residual-header
+; CHECK-NEXT:    s_branch .LBB0_4
+; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT:    s_branch .LBB0_5
+; CHECK-NEXT:  .LBB0_4: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge
+; CHECK-NEXT:    v_lshlrev_b64 v[6:7], 6, v[2:3]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_7
+; CHECK-NEXT:  .LBB0_5: ; %loop-memcpy-residual.preheader
+; CHECK-NEXT:    v_or_b32_e32 v2, 32, v4
+; CHECK-NEXT:    v_mov_b32_e32 v3, v5
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:  ; %bb.6: ; %loop-memcpy-residual
+; CHECK-NEXT:    s_add_u32 s6, 32, s4
+; CHECK-NEXT:    s_addc_u32 s7, 0, s5
+; CHECK-NEXT:    v_mov_b32_e32 v6, s6
+; CHECK-NEXT:    v_mov_b32_e32 v7, s7
+; CHECK-NEXT:    flat_load_ubyte v10, v[6:7]
 ; CHECK-NEXT:    v_mov_b32_e32 v9, s5
-; CHECK-NEXT:    v_mov_b32_e32 v8, s4
-; CHECK-NEXT:    flat_load_ubyte v10, v[8:9]
-; CHECK-NEXT:    v_mov_b32_e32 v9, s7
-; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, s6, v6
-; CHECK-NEXT:    s_add_u32 s6, s6, 1
-; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v9, vcc
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
+; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, s4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
+; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v3, v9, vcc
+; CHECK-NEXT:    s_add_u32 s4, s4, 1
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[8:9], v10
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_7
-; CHECK-NEXT:  ; %bb.8:
-; CHECK-NEXT:    v_mov_b32_e32 v2, v4
-; CHECK-NEXT:    v_mov_b32_e32 v3, v5
-; CHECK-NEXT:  .LBB0_9: ; %post-loop-memcpy-expansion
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 4, v[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v6, 15, v0
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:  .LBB0_7: ; %post-loop-memcpy-expansion
+; CHECK-NEXT:    v_and_b32_e32 v2, 15, v0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_and_b32_e32 v0, -16, v0
-; CHECK-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[4:5]
-; CHECK-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[6:7]
-; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, v2, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v3, v1, vcc
-; CHECK-NEXT:    s_branch .LBB0_12
-; CHECK-NEXT:  .LBB0_10: ; %Flow19
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[0:1]
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[2:3]
+; CHECK-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT:    s_branch .LBB0_10
+; CHECK-NEXT:  .LBB0_8: ; %Flow14
+; CHECK-NEXT:    ; in Loop: Header=BB0_10 Depth=1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:  .LBB0_11: ; %Flow21
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:  .LBB0_9: ; %Flow16
+; CHECK-NEXT:    ; in Loop: Header=BB0_10 Depth=1
 ; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_20
-; CHECK-NEXT:  .LBB0_12: ; %while.cond
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_18
+; CHECK-NEXT:  .LBB0_10: ; %while.cond
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
-; CHECK-NEXT:    ; Child Loop BB0_14 Depth 2
-; CHECK-NEXT:    ; Child Loop BB0_18 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_12 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_16 Depth 2
 ; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_15
-; CHECK-NEXT:  ; %bb.13: ; %loop-memcpy-expansion2.preheader
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_13
+; CHECK-NEXT:  ; %bb.11: ; %loop-memcpy-expansion2.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB0_10 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[10:11], 0
 ; CHECK-NEXT:    s_mov_b64 s[12:13], 0
-; CHECK-NEXT:    s_mov_b64 s[14:15], 0
-; CHECK-NEXT:  .LBB0_14: ; %loop-memcpy-expansion2
-; CHECK-NEXT:    ; Parent Loop BB0_12 Depth=1
+; CHECK-NEXT:  .LBB0_12: ; %loop-memcpy-expansion2
+; CHECK-NEXT:    ; Parent Loop BB0_10 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    v_mov_b32_e32 v10, s10
-; CHECK-NEXT:    v_mov_b32_e32 v11, s11
-; CHECK-NEXT:    flat_load_dwordx4 v[10:13], v[10:11]
-; CHECK-NEXT:    v_mov_b32_e32 v15, s11
-; CHECK-NEXT:    s_add_u32 s14, s14, 1
-; CHECK-NEXT:    v_add_co_u32_e32 v14, vcc, s10, v2
-; CHECK-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
-; CHECK-NEXT:    s_addc_u32 s15, s15, 0
-; CHECK-NEXT:    s_add_u32 s10, s10, 16
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5]
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_or_b64 s[12:13], vcc, s[12:13]
+; CHECK-NEXT:    v_mov_b32_e32 v8, s12
+; CHECK-NEXT:    v_mov_b32_e32 v9, s13
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; CHECK-NEXT:    v_mov_b32_e32 v13, s13
+; CHECK-NEXT:    v_add_co_u32_e32 v12, vcc, s12, v4
+; CHECK-NEXT:    s_add_u32 s12, s12, 16
+; CHECK-NEXT:    v_addc_co_u32_e32 v13, vcc, v5, v13, vcc
+; CHECK-NEXT:    s_addc_u32 s13, s13, 0
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc, s[12:13], v[0:1]
+; CHECK-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[12:13]
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_14
-; CHECK-NEXT:  .LBB0_15: ; %Flow20
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[10:11]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_12
+; CHECK-NEXT:  .LBB0_13: ; %Flow15
+; CHECK-NEXT:    ; in Loop: Header=BB0_10 Depth=1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
-; CHECK-NEXT:    s_cbranch_execz .LBB0_11
-; CHECK-NEXT:  ; %bb.16: ; %loop-memcpy-residual-header5
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_9
+; CHECK-NEXT:  ; %bb.14: ; %loop-memcpy-residual-header5
+; CHECK-NEXT:    ; in Loop: Header=BB0_10 Depth=1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 s[10:11], exec, s[8:9]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_10
-; CHECK-NEXT:  ; %bb.17: ; %loop-memcpy-residual4.preheader
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_8
+; CHECK-NEXT:  ; %bb.15: ; %loop-memcpy-residual4.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB0_10 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[12:13], 0
 ; CHECK-NEXT:    s_mov_b64 s[14:15], 0
-; CHECK-NEXT:  .LBB0_18: ; %loop-memcpy-residual4
-; CHECK-NEXT:    ; Parent Loop BB0_12 Depth=1
+; CHECK-NEXT:  .LBB0_16: ; %loop-memcpy-residual4
+; CHECK-NEXT:    ; Parent Loop BB0_10 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    v_mov_b32_e32 v12, s15
-; CHECK-NEXT:    v_add_co_u32_e32 v10, vcc, s14, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v11, vcc, v1, v12, vcc
-; CHECK-NEXT:    flat_load_ubyte v13, v[10:11]
-; CHECK-NEXT:    v_add_co_u32_e32 v10, vcc, s14, v8
+; CHECK-NEXT:    v_mov_b32_e32 v10, s15
+; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, s14, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v10, vcc
+; CHECK-NEXT:    flat_load_ubyte v11, v[8:9]
+; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, s14, v6
 ; CHECK-NEXT:    s_add_u32 s14, s14, 1
 ; CHECK-NEXT:    s_addc_u32 s15, s15, 0
-; CHECK-NEXT:    v_cmp_ge_u64_e64 s[8:9], s[14:15], v[6:7]
-; CHECK-NEXT:    v_addc_co_u32_e32 v11, vcc, v9, v12, vcc
+; CHECK-NEXT:    v_cmp_ge_u64_e64 s[8:9], s[14:15], v[2:3]
+; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v10, vcc
 ; CHECK-NEXT:    s_or_b64 s[12:13], s[8:9], s[12:13]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[10:11], v13
+; CHECK-NEXT:    flat_store_byte v[8:9], v11
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[12:13]
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_18
-; CHECK-NEXT:  ; %bb.19: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_16
+; CHECK-NEXT:  ; %bb.17: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_10 Depth=1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[12:13]
-; CHECK-NEXT:    s_branch .LBB0_10
-; CHECK-NEXT:  .LBB0_20: ; %DummyReturnBlock
+; CHECK-NEXT:    s_branch .LBB0_8
+; CHECK-NEXT:  .LBB0_18: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index dfa489d317403d..a68d2e575607d4 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -10,19 +10,20 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
 ; CHECK-LABEL: memmove_p0_p0:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v9, 0
 ; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v9, 0
+; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow37
+; CHECK-NEXT:  ; %bb.1: ; %Flow35
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_10
-; CHECK-NEXT:  .LBB0_2: ; %Flow38
+; CHECK-NEXT:  .LBB0_2: ; %Flow36
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -30,39 +31,39 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v11, v3
-; CHECK-NEXT:    v_mov_b32_e32 v13, v1
-; CHECK-NEXT:    v_mov_b32_e32 v10, v2
-; CHECK-NEXT:    v_mov_b32_e32 v12, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v11, v1
+; CHECK-NEXT:    v_mov_b32_e32 v13, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v10, v0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v6
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    flat_load_dwordx4 v[14:17], v[10:11]
-; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT:    v_add_co_u32 v10, s5, v10, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT:    flat_load_dwordx4 v[14:17], v[4:5]
+; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
 ; CHECK-NEXT:    s_or_b32 s9, s5, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[14:17]
-; CHECK-NEXT:    v_add_co_u32 v12, s6, v12, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT:    flat_store_dwordx4 v[10:11], v[14:17]
+; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_5
-; CHECK-NEXT:  .LBB0_6: ; %Flow32
+; CHECK-NEXT:  .LBB0_6: ; %Flow30
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_sub_co_u32 v4, s5, v4, v8
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
 ; CHECK-NEXT:    s_mov_b32 s9, 0
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
-; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -79,7 +80,7 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_8
-; CHECK-NEXT:  .LBB0_9: ; %Flow30
+; CHECK-NEXT:  .LBB0_9: ; %Flow28
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -115,36 +116,35 @@ define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_12
-; CHECK-NEXT:  .LBB0_13: ; %Flow36
+; CHECK-NEXT:  .LBB0_13: ; %Flow34
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v4, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[4:5]
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; CHECK-NEXT:    v_add_co_u32 v0, s4, v0, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_15
-; CHECK-NEXT:  .LBB0_16: ; %Flow34
+; CHECK-NEXT:  .LBB0_16: ; %Flow32
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -158,19 +158,20 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
 ; CHECK-LABEL: memmove_p0_p1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v9, 0
 ; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v9, 0
+; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow41
+; CHECK-NEXT:  ; %bb.1: ; %Flow37
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_10
-; CHECK-NEXT:  .LBB1_2: ; %Flow42
+; CHECK-NEXT:  .LBB1_2: ; %Flow38
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -178,39 +179,39 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v11, v3
-; CHECK-NEXT:    v_mov_b32_e32 v13, v1
-; CHECK-NEXT:    v_mov_b32_e32 v10, v2
-; CHECK-NEXT:    v_mov_b32_e32 v12, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v11, v1
+; CHECK-NEXT:    v_mov_b32_e32 v13, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v10, v0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v6
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB1_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[10:11], off
-; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT:    v_add_co_u32 v10, s5, v10, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[4:5], off
+; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
 ; CHECK-NEXT:    s_or_b32 s9, s5, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[14:17]
-; CHECK-NEXT:    v_add_co_u32 v12, s6, v12, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT:    flat_store_dwordx4 v[10:11], v[14:17]
+; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_5
-; CHECK-NEXT:  .LBB1_6: ; %Flow36
+; CHECK-NEXT:  .LBB1_6: ; %Flow32
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_sub_co_u32 v4, s5, v4, v8
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
 ; CHECK-NEXT:    s_mov_b32 s9, 0
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
-; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB1_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -227,7 +228,7 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_8
-; CHECK-NEXT:  .LBB1_9: ; %Flow34
+; CHECK-NEXT:  .LBB1_9: ; %Flow30
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
@@ -263,36 +264,35 @@ define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s5, -1, v11, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_12
-; CHECK-NEXT:  .LBB1_13: ; %Flow40
+; CHECK-NEXT:  .LBB1_13: ; %Flow36
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v4, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB1_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; CHECK-NEXT:    v_add_co_u32 v0, s4, v0, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB1_15
-; CHECK-NEXT:  .LBB1_16: ; %Flow38
+; CHECK-NEXT:  .LBB1_16: ; %Flow34
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -309,7 +309,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT:    v_and_b32_e32 v7, 7, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v8, 0
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[5:6], 3, v[3:4]
+; CHECK-NEXT:    v_and_b32_e32 v5, -8, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[7:8]
 ; CHECK-NEXT:    v_cndmask_b32_e32 v9, -1, v0, vcc_lo
@@ -317,10 +318,10 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT:    v_cmpx_ge_u32_e64 v2, v9
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB2_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow40
+; CHECK-NEXT:  ; %bb.1: ; %Flow39
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB2_10
-; CHECK-NEXT:  .LBB2_2: ; %Flow41
+; CHECK-NEXT:  .LBB2_2: ; %Flow40
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -329,35 +330,35 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT:    s_cbranch_execz .LBB2_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
 ; CHECK-NEXT:    v_mov_b32_e32 v10, v1
+; CHECK-NEXT:    v_mov_b32_e32 v12, v6
 ; CHECK-NEXT:    v_mov_b32_e32 v9, v0
-; CHECK-NEXT:    v_mov_b32_e32 v11, v2
+; CHECK-NEXT:    v_mov_b32_e32 v11, v5
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB2_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ds_read_b64 v[12:13], v11
-; CHECK-NEXT:    v_add_co_u32 v5, s5, v5, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
-; CHECK-NEXT:    v_add_nc_u32_e32 v11, 8, v11
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[5:6]
+; CHECK-NEXT:    ds_read_b64 v[13:14], v4
+; CHECK-NEXT:    v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, 8, v4
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[11:12]
 ; CHECK-NEXT:    s_or_b32 s9, s5, s9
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[9:10], v[12:13]
+; CHECK-NEXT:    flat_store_dwordx2 v[9:10], v[13:14]
 ; CHECK-NEXT:    v_add_co_u32 v9, s6, v9, 8
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB2_5
-; CHECK-NEXT:  .LBB2_6: ; %Flow35
+; CHECK-NEXT:  .LBB2_6: ; %Flow34
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB2_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_and_b32_e32 v5, -8, v3
-; CHECK-NEXT:    v_sub_co_u32 v3, s5, v3, v7
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5
-; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v5
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v3
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v4, s5
+; CHECK-NEXT:    v_and_b32_e32 v3, -8, v3
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v5
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB2_8: ; %memmove_fwd_residual_loop
@@ -374,7 +375,7 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB2_8
-; CHECK-NEXT:  .LBB2_9: ; %Flow33
+; CHECK-NEXT:  .LBB2_9: ; %Flow32
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    ; implicit-def: $vgpr3_vgpr4
@@ -408,34 +409,34 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, s5, -1, v10, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB2_12
-; CHECK-NEXT:  .LBB2_13: ; %Flow39
+; CHECK-NEXT:  .LBB2_13: ; %Flow38
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB2_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[7:8], 3, v[5:6]
 ; CHECK-NEXT:    v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add3_u32 v2, v3, v2, -8
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v7, v0
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v8, v1, vcc_lo
 ; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -8
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT:    v_add3_u32 v2, v3, v2, -8
+; CHECK-NEXT:    s_mov_b32 s7, 0
+; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB2_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ds_read_b64 v[3:4], v2
-; CHECK-NEXT:    v_add_co_u32 v5, vcc_lo, v5, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v7, vcc_lo, v5, -8
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v9, vcc_lo, v0, v5
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v1, v6, vcc_lo
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[7:8]
+; CHECK-NEXT:    v_mov_b32_e32 v5, v7
 ; CHECK-NEXT:    v_add_nc_u32_e32 v2, -8, v2
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6]
-; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT:    v_mov_b32_e32 v6, v8
+; CHECK-NEXT:    s_or_b32 s7, s4, s7
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
-; CHECK-NEXT:    v_add_co_u32 v0, s4, v0, -8
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT:    flat_store_dwordx2 v[9:10], v[3:4]
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB2_15
-; CHECK-NEXT:  .LBB2_16: ; %Flow37
+; CHECK-NEXT:  .LBB2_16: ; %Flow36
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -449,19 +450,20 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
 ; CHECK-LABEL: memmove_p0_p4:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v9, 0
 ; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v9, 0
+; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB3_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow37
+; CHECK-NEXT:  ; %bb.1: ; %Flow34
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB3_10
-; CHECK-NEXT:  .LBB3_2: ; %Flow38
+; CHECK-NEXT:  .LBB3_2: ; %Flow35
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -469,39 +471,39 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB3_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v11, v3
-; CHECK-NEXT:    v_mov_b32_e32 v13, v1
-; CHECK-NEXT:    v_mov_b32_e32 v10, v2
-; CHECK-NEXT:    v_mov_b32_e32 v12, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v11, v1
+; CHECK-NEXT:    v_mov_b32_e32 v13, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v10, v0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v6
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB3_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[10:11], off
-; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT:    v_add_co_u32 v10, s5, v10, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[4:5], off
+; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
 ; CHECK-NEXT:    s_or_b32 s9, s5, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[14:17]
-; CHECK-NEXT:    v_add_co_u32 v12, s6, v12, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT:    flat_store_dwordx4 v[10:11], v[14:17]
+; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB3_5
-; CHECK-NEXT:  .LBB3_6: ; %Flow32
+; CHECK-NEXT:  .LBB3_6: ; %Flow29
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB3_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_sub_co_u32 v4, s5, v4, v8
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
 ; CHECK-NEXT:    s_mov_b32 s9, 0
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
-; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB3_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -518,11 +520,11 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB3_8
-; CHECK-NEXT:  .LBB3_9: ; %Flow30
+; CHECK-NEXT:  .LBB3_9: ; %Flow27
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
@@ -554,36 +556,35 @@ define void @memmove_p0_p4(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB3_12
-; CHECK-NEXT:  .LBB3_13: ; %Flow36
+; CHECK-NEXT:  .LBB3_13: ; %Flow33
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB3_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v4, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB3_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; CHECK-NEXT:    v_add_co_u32 v0, s4, v0, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB3_15
-; CHECK-NEXT:  .LBB3_16: ; %Flow34
+; CHECK-NEXT:  .LBB3_16: ; %Flow31
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -600,7 +601,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    v_and_b32_e32 v7, 15, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v8, 0
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[5:6], 4, v[3:4]
+; CHECK-NEXT:    v_and_b32_e32 v5, -16, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[7:8]
 ; CHECK-NEXT:    v_cndmask_b32_e32 v9, -1, v0, vcc_lo
@@ -608,10 +610,10 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    v_cmpx_ge_u32_e64 v2, v9
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB4_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow40
+; CHECK-NEXT:  ; %bb.1: ; %Flow39
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB4_10
-; CHECK-NEXT:  .LBB4_2: ; %Flow41
+; CHECK-NEXT:  .LBB4_2: ; %Flow40
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -620,39 +622,39 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    s_cbranch_execz .LBB4_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
 ; CHECK-NEXT:    v_mov_b32_e32 v10, v1
+; CHECK-NEXT:    v_mov_b32_e32 v12, v6
 ; CHECK-NEXT:    v_mov_b32_e32 v9, v0
-; CHECK-NEXT:    v_mov_b32_e32 v11, v2
+; CHECK-NEXT:    v_mov_b32_e32 v11, v5
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB4_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    buffer_load_dword v12, v11, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v13, v11, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v14, v11, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v15, v11, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    v_add_co_u32 v5, s5, v5, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
-; CHECK-NEXT:    v_add_nc_u32_e32 v11, 16, v11
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[5:6]
+; CHECK-NEXT:    buffer_load_dword v13, v4, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v14, v4, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v15, v4, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v16, v4, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    v_add_co_u32 v11, s5, v11, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, 16, v4
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[11:12]
 ; CHECK-NEXT:    s_or_b32 s9, s5, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[9:10], v[12:15]
+; CHECK-NEXT:    flat_store_dwordx4 v[9:10], v[13:16]
 ; CHECK-NEXT:    v_add_co_u32 v9, s6, v9, 16
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB4_5
-; CHECK-NEXT:  .LBB4_6: ; %Flow35
+; CHECK-NEXT:  .LBB4_6: ; %Flow34
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB4_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_and_b32_e32 v5, -16, v3
-; CHECK-NEXT:    v_sub_co_u32 v3, s5, v3, v7
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5
-; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v5
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v3
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v4, s5
+; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v5
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v6, s5
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB4_8: ; %memmove_fwd_residual_loop
@@ -669,7 +671,7 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB4_8
-; CHECK-NEXT:  .LBB4_9: ; %Flow33
+; CHECK-NEXT:  .LBB4_9: ; %Flow32
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    ; implicit-def: $vgpr3_vgpr4
@@ -703,19 +705,16 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, s5, -1, v10, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB4_12
-; CHECK-NEXT:  .LBB4_13: ; %Flow39
+; CHECK-NEXT:  .LBB4_13: ; %Flow38
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB4_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[7:8], 4, v[5:6]
 ; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
-; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add3_u32 v2, v3, v2, -16
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v7, v0
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v8, v1, vcc_lo
 ; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT:    v_add3_u32 v2, v3, v2, -16
+; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB4_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -724,18 +723,20 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    v_add_co_u32 v5, vcc_lo, v5, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v3, vcc_lo, v5, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v6, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v11, vcc_lo, v0, v5
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v1, v6, vcc_lo
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[3:4]
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    v_add_nc_u32_e32 v2, -16, v2
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6]
-; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT:    v_mov_b32_e32 v5, v3
+; CHECK-NEXT:    s_or_b32 s7, s4, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
-; CHECK-NEXT:    v_add_co_u32 v0, s4, v0, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[7:10]
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB4_15
-; CHECK-NEXT:  .LBB4_16: ; %Flow37
+; CHECK-NEXT:  .LBB4_16: ; %Flow36
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -750,58 +751,59 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
 ; CHECK-LABEL: memmove_p1_p0:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v9, 0
 ; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v9, 0
+; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB5_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow41
+; CHECK-NEXT:  ; %bb.1: ; %Flow37
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB5_10
-; CHECK-NEXT:  .LBB5_2: ; %Flow42
+; CHECK-NEXT:  .LBB5_2: ; %Flow38
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB5_3: ; %memmove_copy_forward
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB5_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v11, v3
-; CHECK-NEXT:    v_mov_b32_e32 v13, v1
-; CHECK-NEXT:    v_mov_b32_e32 v10, v2
-; CHECK-NEXT:    v_mov_b32_e32 v12, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v11, v1
+; CHECK-NEXT:    v_mov_b32_e32 v13, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v10, v0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v6
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB5_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    flat_load_dwordx4 v[14:17], v[10:11]
-; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT:    v_add_co_u32 v10, s5, v10, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT:    flat_load_dwordx4 v[14:17], v[4:5]
+; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
 ; CHECK-NEXT:    s_or_b32 s9, s5, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[12:13], v[14:17], off
-; CHECK-NEXT:    v_add_co_u32 v12, s6, v12, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT:    global_store_dwordx4 v[10:11], v[14:17], off
+; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB5_5
-; CHECK-NEXT:  .LBB5_6: ; %Flow36
+; CHECK-NEXT:  .LBB5_6: ; %Flow32
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB5_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_sub_co_u32 v4, s5, v4, v8
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
 ; CHECK-NEXT:    s_mov_b32 s9, 0
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
-; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB5_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -818,7 +820,7 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB5_8
-; CHECK-NEXT:  .LBB5_9: ; %Flow34
+; CHECK-NEXT:  .LBB5_9: ; %Flow30
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -854,36 +856,35 @@ define void @memmove_p1_p0(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB5_12
-; CHECK-NEXT:  .LBB5_13: ; %Flow40
+; CHECK-NEXT:  .LBB5_13: ; %Flow36
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB5_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v4, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB5_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[4:5]
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
-; CHECK-NEXT:    v_add_co_u32 v0, s4, v0, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT:    global_store_dwordx4 v[12:13], v[8:11], off
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB5_15
-; CHECK-NEXT:  .LBB5_16: ; %Flow38
+; CHECK-NEXT:  .LBB5_16: ; %Flow34
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -896,58 +897,59 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
 ; CHECK-LABEL: memmove_p1_p1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v9, 0
 ; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v9, 0
+; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB6_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow47
+; CHECK-NEXT:  ; %bb.1: ; %Flow41
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB6_10
-; CHECK-NEXT:  .LBB6_2: ; %Flow48
+; CHECK-NEXT:  .LBB6_2: ; %Flow42
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB6_3: ; %memmove_copy_forward
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB6_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v11, v3
-; CHECK-NEXT:    v_mov_b32_e32 v13, v1
-; CHECK-NEXT:    v_mov_b32_e32 v10, v2
-; CHECK-NEXT:    v_mov_b32_e32 v12, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v11, v1
+; CHECK-NEXT:    v_mov_b32_e32 v13, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v10, v0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v6
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB6_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[10:11], off
-; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT:    v_add_co_u32 v10, s5, v10, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[4:5], off
+; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
 ; CHECK-NEXT:    s_or_b32 s9, s5, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[12:13], v[14:17], off
-; CHECK-NEXT:    v_add_co_u32 v12, s6, v12, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT:    global_store_dwordx4 v[10:11], v[14:17], off
+; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB6_5
-; CHECK-NEXT:  .LBB6_6: ; %Flow42
+; CHECK-NEXT:  .LBB6_6: ; %Flow36
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB6_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_sub_co_u32 v4, s5, v4, v8
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
 ; CHECK-NEXT:    s_mov_b32 s9, 0
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
-; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB6_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -964,11 +966,11 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB6_8
-; CHECK-NEXT:  .LBB6_9: ; %Flow40
+; CHECK-NEXT:  .LBB6_9: ; %Flow34
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
@@ -1000,36 +1002,35 @@ define void @memmove_p1_p1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB6_12
-; CHECK-NEXT:  .LBB6_13: ; %Flow46
+; CHECK-NEXT:  .LBB6_13: ; %Flow40
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB6_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v4, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB6_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
-; CHECK-NEXT:    v_add_co_u32 v0, s4, v0, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT:    global_store_dwordx4 v[12:13], v[8:11], off
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB6_15
-; CHECK-NEXT:  .LBB6_16: ; %Flow44
+; CHECK-NEXT:  .LBB6_16: ; %Flow38
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1042,63 +1043,61 @@ define void @memmove_p1_p3(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align
 ; CHECK-LABEL: memmove_p1_p3:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 3, v[3:4]
-; CHECK-NEXT:    s_mov_b64 s[6:7], 0
-; CHECK-NEXT:    s_mov_b32 s5, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b32 s6, exec_lo
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT:    s_cbranch_execz .LBB7_3
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v7, v1
-; CHECK-NEXT:    v_mov_b32_e32 v6, v0
-; CHECK-NEXT:    v_mov_b32_e32 v8, v2
-; CHECK-NEXT:    s_mov_b32 s8, 0
+; CHECK-NEXT:    v_mov_b32_e32 v9, v2
+; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:  .LBB7_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ds_read_b64 v[9:10], v8
-; CHECK-NEXT:    s_add_u32 s6, s6, 1
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, 8, v8
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[6:7], v[4:5]
-; CHECK-NEXT:    s_or_b32 s8, vcc_lo, s8
+; CHECK-NEXT:    ds_read_b64 v[10:11], v9
+; CHECK-NEXT:    v_add_co_u32 v12, vcc_lo, v0, s4
+; CHECK-NEXT:    s_add_u32 s4, s4, 8
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx2 v[6:7], v[9:10], off
-; CHECK-NEXT:    v_add_co_u32 v6, s4, v6, 8
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s4, 0, v7, s4
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    global_store_dwordx2 v[12:13], v[10:11], off
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_2
-; CHECK-NEXT:  .LBB7_3: ; %Flow11
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; CHECK-NEXT:    v_and_b32_e32 v6, 7, v3
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:  .LBB7_3: ; %Flow9
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[5:6]
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB7_7
 ; CHECK-NEXT:  ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_and_b32_e32 v8, -8, v3
-; CHECK-NEXT:    v_lshlrev_b64 v[3:4], 3, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v3, -8, v3
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v8
 ; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
 ; CHECK-NEXT:  .LBB7_5: ; %loop-memcpy-residual
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ds_read_u8 v5, v2
+; CHECK-NEXT:    ds_read_u8 v7, v2
 ; CHECK-NEXT:    v_add_co_u32 v3, vcc_lo, v0, s4
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
 ; CHECK-NEXT:    v_add_nc_u32_e32 v2, 1, v2
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_store_byte v[3:4], v5, off
+; CHECK-NEXT:    global_store_byte v[3:4], v7, off
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_5
 ; CHECK-NEXT:  ; %bb.6: ; %Flow
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:  .LBB7_7: ; %Flow9
+; CHECK-NEXT:  .LBB7_7: ; %Flow7
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1110,58 +1109,59 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
 ; CHECK-LABEL: memmove_p1_p4:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v9, 0
 ; CHECK-NEXT:    v_and_b32_e32 v8, 15, v4
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v9, 0
+; CHECK-NEXT:    v_and_b32_e32 v6, -16, v4
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[8:9]
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[2:3], v[0:1]
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB8_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow41
+; CHECK-NEXT:  ; %bb.1: ; %Flow38
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB8_10
-; CHECK-NEXT:  .LBB8_2: ; %Flow42
+; CHECK-NEXT:  .LBB8_2: ; %Flow39
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB8_3: ; %memmove_copy_forward
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB8_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v11, v3
-; CHECK-NEXT:    v_mov_b32_e32 v13, v1
-; CHECK-NEXT:    v_mov_b32_e32 v10, v2
-; CHECK-NEXT:    v_mov_b32_e32 v12, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v11, v1
+; CHECK-NEXT:    v_mov_b32_e32 v13, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v10, v0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v6
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB8_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[10:11], off
-; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
-; CHECK-NEXT:    v_add_co_u32 v10, s5, v10, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT:    global_load_dwordx4 v[14:17], v[4:5], off
+; CHECK-NEXT:    v_add_co_u32 v12, s5, v12, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s5, -1, v13, s5
+; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[12:13]
 ; CHECK-NEXT:    s_or_b32 s9, s5, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[12:13], v[14:17], off
-; CHECK-NEXT:    v_add_co_u32 v12, s6, v12, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT:    global_store_dwordx4 v[10:11], v[14:17], off
+; CHECK-NEXT:    v_add_co_u32 v10, s6, v10, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, s6, 0, v11, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB8_5
-; CHECK-NEXT:  .LBB8_6: ; %Flow36
+; CHECK-NEXT:  .LBB8_6: ; %Flow33
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB8_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_sub_co_u32 v4, s5, v4, v8
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v7, s5
+; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v7, s5
 ; CHECK-NEXT:    s_mov_b32 s9, 0
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
-; CHECK-NEXT:    v_add_co_u32 v2, s5, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB8_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1178,11 +1178,11 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB8_8
-; CHECK-NEXT:  .LBB8_9: ; %Flow34
+; CHECK-NEXT:  .LBB8_9: ; %Flow31
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
@@ -1214,36 +1214,35 @@ define void @memmove_p1_p4(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB8_12
-; CHECK-NEXT:  .LBB8_13: ; %Flow40
+; CHECK-NEXT:  .LBB8_13: ; %Flow37
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB8_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v4, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB8_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v2, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v3, v7, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v12, s4, v0, v6
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v13, s4, v1, v7, s4
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[4:5], off
+; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v6, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v7, v5
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
-; CHECK-NEXT:    v_add_co_u32 v0, s4, v0, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT:    global_store_dwordx4 v[12:13], v[8:11], off
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB8_15
-; CHECK-NEXT:  .LBB8_16: ; %Flow38
+; CHECK-NEXT:  .LBB8_16: ; %Flow35
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1256,68 +1255,66 @@ define void @memmove_p1_p5(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align
 ; CHECK-LABEL: memmove_p1_p5:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 4, v[3:4]
-; CHECK-NEXT:    s_mov_b64 s[6:7], 0
-; CHECK-NEXT:    s_mov_b32 s5, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v7, -16, v3
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 15, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b32 s6, exec_lo
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT:    s_cbranch_execz .LBB9_3
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v7, v1
-; CHECK-NEXT:    v_mov_b32_e32 v6, v0
-; CHECK-NEXT:    v_mov_b32_e32 v8, v2
-; CHECK-NEXT:    s_mov_b32 s8, 0
+; CHECK-NEXT:    v_mov_b32_e32 v9, v2
+; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB9_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    buffer_load_dword v9, v8, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v10, v8, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v11, v8, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v12, v8, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_add_u32 s6, s6, 1
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, 16, v8
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[6:7], v[4:5]
-; CHECK-NEXT:    s_or_b32 s8, vcc_lo, s8
+; CHECK-NEXT:    buffer_load_dword v10, v9, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v11, v9, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v12, v9, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v13, v9, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    v_add_co_u32 v14, vcc_lo, v0, s4
+; CHECK-NEXT:    s_add_u32 s4, s4, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v15, vcc_lo, s5, v1, vcc_lo
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v9, 16, v9
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[6:7], v[9:12], off
-; CHECK-NEXT:    v_add_co_u32 v6, s4, v6, 16
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s4, 0, v7, s4
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    global_store_dwordx4 v[14:15], v[10:13], off
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB9_2
-; CHECK-NEXT:  .LBB9_3: ; %Flow11
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; CHECK-NEXT:    v_and_b32_e32 v6, 15, v3
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:  .LBB9_3: ; %Flow9
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[5:6]
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB9_7
 ; CHECK-NEXT:  ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_and_b32_e32 v8, -16, v3
-; CHECK-NEXT:    v_lshlrev_b64 v[3:4], 4, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v8
 ; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, v2, v3
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
 ; CHECK-NEXT:  .LBB9_5: ; %loop-memcpy-residual
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    v_add_co_u32 v3, vcc_lo, v0, s4
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s5, v1, vcc_lo
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
 ; CHECK-NEXT:    v_add_nc_u32_e32 v2, 1, v2
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_byte v[3:4], v5, off
+; CHECK-NEXT:    global_store_byte v[3:4], v7, off
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB9_5
 ; CHECK-NEXT:  ; %bb.6: ; %Flow
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:  .LBB9_7: ; %Flow9
+; CHECK-NEXT:  .LBB9_7: ; %Flow7
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1330,23 +1327,24 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-LABEL: memmove_p3_p0:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v7, 7, v3
-; CHECK-NEXT:    v_mov_b32_e32 v8, 0
+; CHECK-NEXT:    v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, -1, v0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; CHECK-NEXT:    v_lshrrev_b64 v[5:6], 3, v[3:4]
-; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[7:8]
+; CHECK-NEXT:    v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[5:6]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, s5, vcc_lo
 ; CHECK-NEXT:    v_cndmask_b32_e32 v9, 0, v0, vcc_lo
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
+; CHECK-NEXT:    s_mov_b32 s6, exec_lo
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[7:8]
 ; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[1:2], v[9:10]
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB10_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow40
+; CHECK-NEXT:  ; %bb.1: ; %Flow39
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB10_10
-; CHECK-NEXT:  .LBB10_2: ; %Flow41
+; CHECK-NEXT:  .LBB10_2: ; %Flow40
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1355,58 +1353,58 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:    s_cbranch_execz .LBB10_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
 ; CHECK-NEXT:    v_mov_b32_e32 v10, v2
+; CHECK-NEXT:    v_mov_b32_e32 v12, v8
 ; CHECK-NEXT:    v_mov_b32_e32 v9, v1
-; CHECK-NEXT:    v_mov_b32_e32 v11, v0
+; CHECK-NEXT:    v_mov_b32_e32 v11, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
 ; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB10_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    flat_load_dwordx2 v[12:13], v[9:10]
-; CHECK-NEXT:    v_add_co_u32 v5, s5, v5, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
+; CHECK-NEXT:    flat_load_dwordx2 v[13:14], v[9:10]
+; CHECK-NEXT:    v_add_co_u32 v11, s5, v11, -8
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
 ; CHECK-NEXT:    v_add_co_u32 v9, s5, v9, 8
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s6, 0, v[5:6]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s6, 0, v[11:12]
 ; CHECK-NEXT:    s_or_b32 s9, s6, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_write_b64 v11, v[12:13]
-; CHECK-NEXT:    v_add_nc_u32_e32 v11, 8, v11
+; CHECK-NEXT:    ds_write_b64 v4, v[13:14]
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, 8, v4
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB10_5
-; CHECK-NEXT:  .LBB10_6: ; %Flow35
+; CHECK-NEXT:  .LBB10_6: ; %Flow34
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB10_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_and_b32_e32 v5, -8, v3
-; CHECK-NEXT:    v_sub_co_u32 v6, s5, v3, v7
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5
-; CHECK-NEXT:    v_add_nc_u32_e32 v3, v0, v5
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v1, v6
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v2, v4, s5
+; CHECK-NEXT:    v_and_b32_e32 v3, -8, v3
 ; CHECK-NEXT:    s_mov_b32 s9, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v3, v0, v3
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v1, v7
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v2, v8, s5
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB10_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    flat_load_ubyte v2, v[0:1]
-; CHECK-NEXT:    v_add_co_u32 v7, s5, v7, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
+; CHECK-NEXT:    v_add_co_u32 v5, s5, v5, -1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
 ; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, 1
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, 0, v1, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s6, 0, v[7:8]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s6, 0, v[5:6]
 ; CHECK-NEXT:    s_or_b32 s9, s6, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_write_b8 v3, v2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v3, 1, v3
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB10_8
-; CHECK-NEXT:  .LBB10_9: ; %Flow33
+; CHECK-NEXT:  .LBB10_9: ; %Flow32
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; CHECK-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; CHECK-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execz .LBB10_2
 ; CHECK-NEXT:  .LBB10_10: ; %memmove_copy_backwards
@@ -1423,46 +1421,43 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:  .LBB10_12: ; %memmove_bwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    flat_load_ubyte v11, v[9:10]
-; CHECK-NEXT:    v_add_co_u32 v7, s4, v7, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
+; CHECK-NEXT:    v_add_co_u32 v5, s4, v5, -1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v6, s4, -1, v6, s4
 ; CHECK-NEXT:    v_add_co_u32 v9, s4, v9, -1
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[7:8]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[5:6]
 ; CHECK-NEXT:    s_or_b32 s8, s5, s8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ds_write_b8 v4, v11
 ; CHECK-NEXT:    v_add_nc_u32_e32 v4, -1, v4
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB10_12
-; CHECK-NEXT:  .LBB10_13: ; %Flow39
+; CHECK-NEXT:  .LBB10_13: ; %Flow38
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB10_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[7:8], 3, v[5:6]
 ; CHECK-NEXT:    v_and_b32_e32 v3, -8, v3
-; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_co_u32 v1, vcc_lo, v7, v1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v8, v2, vcc_lo
-; CHECK-NEXT:    v_add3_u32 v2, v3, v0, -8
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v1, -8
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v1, vcc_lo, v1, -8
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
+; CHECK-NEXT:    v_add3_u32 v0, v3, v0, -8
+; CHECK-NEXT:    s_mov_b32 s5, 0
 ; CHECK-NEXT:  .LBB10_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
-; CHECK-NEXT:    v_add_co_u32 v5, vcc_lo, v5, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, -1, v6, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -8
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[5:6]
-; CHECK-NEXT:    s_or_b32 s7, s4, s7
+; CHECK-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v7
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v7, vcc_lo, v7, -8
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
+; CHECK-NEXT:    flat_load_dwordx2 v[3:4], v[3:4]
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
+; CHECK-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_write_b64 v2, v[3:4]
-; CHECK-NEXT:    v_add_nc_u32_e32 v2, -8, v2
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT:    ds_write_b64 v0, v[3:4]
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, -8, v0
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_cbranch_execnz .LBB10_15
-; CHECK-NEXT:  .LBB10_16: ; %Flow37
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT:  .LBB10_16: ; %Flow36
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1475,45 +1470,43 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-LABEL: memmove_p3_p1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 3, v[3:4]
-; CHECK-NEXT:    s_mov_b64 s[6:7], 0
-; CHECK-NEXT:    s_mov_b32 s5, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b32 s6, exec_lo
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT:    s_cbranch_execz .LBB11_3
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v7, v2
-; CHECK-NEXT:    v_mov_b32_e32 v6, v1
-; CHECK-NEXT:    v_mov_b32_e32 v8, v0
-; CHECK-NEXT:    s_mov_b32 s8, 0
+; CHECK-NEXT:    v_mov_b32_e32 v9, v0
+; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:  .LBB11_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx2 v[9:10], v[6:7], off
-; CHECK-NEXT:    s_add_u32 s6, s6, 1
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, 8
-; CHECK-NEXT:    v_cmp_ge_u64_e64 s4, s[6:7], v[4:5]
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; CHECK-NEXT:    s_or_b32 s8, s4, s8
+; CHECK-NEXT:    v_add_co_u32 v10, vcc_lo, v1, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT:    s_add_u32 s4, s4, 8
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; CHECK-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b64 v8, v[9:10]
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, 8, v8
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    ds_write_b64 v9, v[10:11]
+; CHECK-NEXT:    v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB11_2
-; CHECK-NEXT:  .LBB11_3: ; %Flow11
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; CHECK-NEXT:    v_and_b32_e32 v6, 7, v3
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:  .LBB11_3: ; %Flow9
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[5:6]
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB11_7
 ; CHECK-NEXT:  ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_and_b32_e32 v8, -8, v3
-; CHECK-NEXT:    v_lshlrev_b64 v[3:4], 3, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v3, -8, v3
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v8
 ; CHECK-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; CHECK-NEXT:  .LBB11_5: ; %loop-memcpy-residual
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1521,7 +1514,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
 ; CHECK-NEXT:    global_load_ubyte v3, v[3:4], off
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
@@ -1531,7 +1524,7 @@ define void @memmove_p3_p1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    s_cbranch_execnz .LBB11_5
 ; CHECK-NEXT:  ; %bb.6: ; %Flow
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:  .LBB11_7: ; %Flow9
+; CHECK-NEXT:  .LBB11_7: ; %Flow7
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1544,19 +1537,20 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
 ; CHECK-LABEL: memmove_p3_p3:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v6, 7, v2
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 3, v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_and_b32_e32 v4, 7, v2
+; CHECK-NEXT:    v_and_b32_e32 v6, -8, v2
+; CHECK-NEXT:    v_mov_b32_e32 v7, v3
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[6:7]
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[4:5]
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; CHECK-NEXT:    v_cmpx_ge_u32_e64 v1, v0
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB12_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow44
+; CHECK-NEXT:  ; %bb.1: ; %Flow46
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s5, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB12_10
-; CHECK-NEXT:  .LBB12_2: ; %Flow45
+; CHECK-NEXT:  .LBB12_2: ; %Flow47
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1570,17 +1564,17 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT:  .LBB12_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ds_read_b64 v[9:10], v3
-; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -8
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
 ; CHECK-NEXT:    v_add_nc_u32_e32 v3, 8, v3
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
 ; CHECK-NEXT:    s_or_b32 s8, s5, s8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    ds_write_b64 v8, v[9:10]
 ; CHECK-NEXT:    v_add_nc_u32_e32 v8, 8, v8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB12_5
-; CHECK-NEXT:  .LBB12_6: ; %Flow39
+; CHECK-NEXT:  .LBB12_6: ; %Flow41
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s7, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB12_9
@@ -1592,70 +1586,71 @@ define void @memmove_p3_p3(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT:  .LBB12_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ds_read_u8 v2, v1
-; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, -1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
 ; CHECK-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[4:5]
 ; CHECK-NEXT:    s_or_b32 s8, s5, s8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    ds_write_b8 v0, v2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB12_8
-; CHECK-NEXT:  .LBB12_9: ; %Flow37
+; CHECK-NEXT:  .LBB12_9: ; %Flow39
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    ; implicit-def: $vgpr1
-; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s5, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB12_2
 ; CHECK-NEXT:  .LBB12_10: ; %memmove_copy_backwards
 ; CHECK-NEXT:    s_and_saveexec_b32 s6, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB12_13
 ; CHECK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, -1, v2
+; CHECK-NEXT:    v_add_nc_u32_e32 v7, -1, v2
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v3, v0, v8
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, v1, v8
+; CHECK-NEXT:    v_add_nc_u32_e32 v6, v0, v7
+; CHECK-NEXT:    v_add_nc_u32_e32 v7, v1, v7
 ; CHECK-NEXT:  .LBB12_12: ; %memmove_bwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ds_read_u8 v9, v8
-; CHECK-NEXT:    v_add_co_u32 v6, s4, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s4, -1, v7, s4
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, -1, v8
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[6:7]
+; CHECK-NEXT:    ds_read_u8 v8, v7
+; CHECK-NEXT:    v_add_co_u32 v4, s4, v4, -1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
+; CHECK-NEXT:    v_add_nc_u32_e32 v7, -1, v7
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[4:5]
 ; CHECK-NEXT:    s_or_b32 s7, s4, s7
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v3, v9
-; CHECK-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; CHECK-NEXT:    ds_write_b8 v6, v8
+; CHECK-NEXT:    v_add_nc_u32_e32 v6, -1, v6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB12_12
-; CHECK-NEXT:  .LBB12_13: ; %Flow43
+; CHECK-NEXT:  .LBB12_13: ; %Flow45
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB12_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT:    v_and_b32_e32 v5, -8, v2
 ; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v2, -8, v2
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v2
-; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, -8, v5
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, v0, v4
+; CHECK-NEXT:    v_sub_co_u32 v0, vcc_lo, 0, v5
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, v1, v4
+; CHECK-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
 ; CHECK-NEXT:  .LBB12_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ds_read_b64 v[2:3], v1
-; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v4, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
-; CHECK-NEXT:    v_add_nc_u32_e32 v1, -8, v1
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    ds_read_b64 v[5:6], v4
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 8
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, -8, v4
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
 ; CHECK-NEXT:    s_or_b32 s6, vcc_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ds_write_b64 v0, v[2:3]
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, -8, v0
+; CHECK-NEXT:    ds_write_b64 v2, v[5:6]
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, -8, v2
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB12_15
-; CHECK-NEXT:  .LBB12_16: ; %Flow41
+; CHECK-NEXT:  .LBB12_16: ; %Flow43
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1669,45 +1664,43 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
 ; CHECK-LABEL: memmove_p3_p4:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 3, v[3:4]
-; CHECK-NEXT:    s_mov_b64 s[6:7], 0
-; CHECK-NEXT:    s_mov_b32 s5, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v7, -8, v3
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 7, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_mov_b32 s6, exec_lo
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT:    s_cbranch_execz .LBB13_3
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v7, v2
-; CHECK-NEXT:    v_mov_b32_e32 v6, v1
-; CHECK-NEXT:    v_mov_b32_e32 v8, v0
-; CHECK-NEXT:    s_mov_b32 s8, 0
+; CHECK-NEXT:    v_mov_b32_e32 v9, v0
+; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:  .LBB13_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx2 v[9:10], v[6:7], off
-; CHECK-NEXT:    s_add_u32 s6, s6, 1
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, 8
-; CHECK-NEXT:    v_cmp_ge_u64_e64 s4, s[6:7], v[4:5]
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; CHECK-NEXT:    s_or_b32 s8, s4, s8
+; CHECK-NEXT:    v_add_co_u32 v10, vcc_lo, v1, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT:    s_add_u32 s4, s4, 8
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; CHECK-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
+; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b64 v8, v[9:10]
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, 8, v8
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:    ds_write_b64 v9, v[10:11]
+; CHECK-NEXT:    v_add_nc_u32_e32 v9, 8, v9
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB13_2
-; CHECK-NEXT:  .LBB13_3: ; %Flow11
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; CHECK-NEXT:    v_and_b32_e32 v6, 7, v3
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
+; CHECK-NEXT:  .LBB13_3: ; %Flow9
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[5:6]
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB13_7
 ; CHECK-NEXT:  ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_and_b32_e32 v8, -8, v3
-; CHECK-NEXT:    v_lshlrev_b64 v[3:4], 3, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v3, -8, v3
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v8
 ; CHECK-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; CHECK-NEXT:  .LBB13_5: ; %loop-memcpy-residual
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1715,7 +1708,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
 ; CHECK-NEXT:    global_load_ubyte v3, v[3:4], off
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
@@ -1725,7 +1718,7 @@ define void @memmove_p3_p4(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    s_cbranch_execnz .LBB13_5
 ; CHECK-NEXT:  ; %bb.6: ; %Flow
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:  .LBB13_7: ; %Flow9
+; CHECK-NEXT:  .LBB13_7: ; %Flow7
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1738,41 +1731,42 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
 ; CHECK-LABEL: memmove_p3_p5:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[3:4], 3, v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[3:4]
+; CHECK-NEXT:    v_and_b32_e32 v2, -8, v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 7, v4
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[2:3]
 ; CHECK-NEXT:    s_cbranch_execz .LBB14_3
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v5, v1
-; CHECK-NEXT:    v_mov_b32_e32 v6, v0
+; CHECK-NEXT:    v_mov_b32_e32 v7, v1
+; CHECK-NEXT:    v_mov_b32_e32 v8, v0
 ; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:  .LBB14_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    buffer_load_dword v7, v5, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v8, v5, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_add_u32 s4, s4, 1
+; CHECK-NEXT:    buffer_load_dword v9, v7, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v10, v7, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_add_u32 s4, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v5, 8, v5
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[3:4]
+; CHECK-NEXT:    v_add_nc_u32_e32 v7, 8, v7
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b64 v6, v[7:8]
-; CHECK-NEXT:    v_add_nc_u32_e32 v6, 8, v6
+; CHECK-NEXT:    ds_write_b64 v8, v[9:10]
+; CHECK-NEXT:    v_add_nc_u32_e32 v8, 8, v8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB14_2
 ; CHECK-NEXT:  .LBB14_3: ; %Flow14
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT:    v_and_b32_e32 v3, 7, v2
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[3:4]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[5:6]
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB14_7
 ; CHECK-NEXT:  ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT:    v_and_b32_e32 v2, -8, v4
 ; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v2
@@ -1782,7 +1776,7 @@ define void @memmove_p3_p5(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
 ; CHECK-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[3:4]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ds_write_b8 v0, v2
@@ -1805,23 +1799,24 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
 ; CHECK-LABEL: memmove_p5_p0:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v7, 15, v3
-; CHECK-NEXT:    v_mov_b32_e32 v8, 0
+; CHECK-NEXT:    v_and_b32_e32 v5, 15, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, -1, v0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], src_private_base
-; CHECK-NEXT:    v_lshrrev_b64 v[5:6], 4, v[3:4]
-; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[7:8]
+; CHECK-NEXT:    v_and_b32_e32 v7, -16, v3
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[5:6]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, s5, vcc_lo
 ; CHECK-NEXT:    v_cndmask_b32_e32 v9, 0, v0, vcc_lo
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
+; CHECK-NEXT:    s_mov_b32 s6, exec_lo
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[7:8]
 ; CHECK-NEXT:    v_cmpx_ge_u64_e64 v[1:2], v[9:10]
 ; CHECK-NEXT:    s_xor_b32 s7, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB15_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow40
+; CHECK-NEXT:  ; %bb.1: ; %Flow39
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB15_10
-; CHECK-NEXT:  .LBB15_2: ; %Flow41
+; CHECK-NEXT:  .LBB15_2: ; %Flow40
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB15_3: ; %memmove_copy_forward
@@ -1829,61 +1824,61 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:    s_cbranch_execz .LBB15_6
 ; CHECK-NEXT:  ; %bb.4: ; %memmove_fwd_main_loop.preheader
 ; CHECK-NEXT:    v_mov_b32_e32 v10, v2
+; CHECK-NEXT:    v_mov_b32_e32 v12, v8
 ; CHECK-NEXT:    v_mov_b32_e32 v9, v1
-; CHECK-NEXT:    v_mov_b32_e32 v11, v0
+; CHECK-NEXT:    v_mov_b32_e32 v11, v7
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB15_5: ; %memmove_fwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    flat_load_dwordx4 v[12:15], v[9:10]
-; CHECK-NEXT:    v_add_co_u32 v5, s5, v5, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
+; CHECK-NEXT:    flat_load_dwordx4 v[13:16], v[9:10]
+; CHECK-NEXT:    v_add_co_u32 v11, s5, v11, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v12, s5, -1, v12, s5
 ; CHECK-NEXT:    v_add_co_u32 v9, s5, v9, 16
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, s5, 0, v10, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[5:6]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[11:12]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v15, v11, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v14, v11, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v13, v11, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_nc_u32_e32 v11, 16, v11
+; CHECK-NEXT:    buffer_store_dword v16, v4, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v15, v4, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v14, v4, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v13, v4, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, 16, v4
 ; CHECK-NEXT:    s_or_b32 s8, s5, s8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB15_5
-; CHECK-NEXT:  .LBB15_6: ; %Flow35
+; CHECK-NEXT:  .LBB15_6: ; %Flow34
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_and_saveexec_b32 s8, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB15_9
 ; CHECK-NEXT:  ; %bb.7: ; %memmove_fwd_residual_loop.preheader
-; CHECK-NEXT:    v_and_b32_e32 v5, -16, v3
-; CHECK-NEXT:    v_sub_co_u32 v6, s5, v3, v7
-; CHECK-NEXT:    v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5
-; CHECK-NEXT:    v_add_nc_u32_e32 v3, v0, v5
-; CHECK-NEXT:    v_add_co_u32 v0, s5, v1, v6
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v2, v4, s5
+; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT:    s_mov_b32 s9, 0
+; CHECK-NEXT:    v_add_nc_u32_e32 v3, v0, v3
+; CHECK-NEXT:    v_add_co_u32 v0, s5, v1, v7
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, v2, v8, s5
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB15_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    flat_load_ubyte v2, v[0:1]
-; CHECK-NEXT:    v_add_co_u32 v7, s5, v7, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
+; CHECK-NEXT:    v_add_co_u32 v5, s5, v5, -1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
 ; CHECK-NEXT:    v_add_co_u32 v0, s5, v0, 1
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, s5, 0, v1, s5
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s6, 0, v[7:8]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s6, 0, v[5:6]
 ; CHECK-NEXT:    s_or_b32 s9, s6, s9
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_store_byte v2, v3, s[0:3], 0 offen
 ; CHECK-NEXT:    v_add_nc_u32_e32 v3, 1, v3
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s9
 ; CHECK-NEXT:    s_cbranch_execnz .LBB15_8
-; CHECK-NEXT:  .LBB15_9: ; %Flow33
+; CHECK-NEXT:  .LBB15_9: ; %Flow32
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT:    ; implicit-def: $vgpr7_vgpr8
 ; CHECK-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; CHECK-NEXT:    ; implicit-def: $vgpr7_vgpr8
+; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s6, s7
 ; CHECK-NEXT:    s_cbranch_execz .LBB15_2
 ; CHECK-NEXT:  .LBB15_10: ; %memmove_copy_backwards
@@ -1900,49 +1895,46 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT:  .LBB15_12: ; %memmove_bwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    flat_load_ubyte v11, v[9:10]
-; CHECK-NEXT:    v_add_co_u32 v7, s4, v7, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
+; CHECK-NEXT:    v_add_co_u32 v5, s4, v5, -1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v6, s4, -1, v6, s4
 ; CHECK-NEXT:    v_add_co_u32 v9, s4, v9, -1
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[7:8]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[5:6]
 ; CHECK-NEXT:    s_or_b32 s8, s5, s8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_store_byte v11, v4, s[0:3], 0 offen
 ; CHECK-NEXT:    v_add_nc_u32_e32 v4, -1, v4
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB15_12
-; CHECK-NEXT:  .LBB15_13: ; %Flow39
+; CHECK-NEXT:  .LBB15_13: ; %Flow38
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB15_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_lshlrev_b64 v[7:8], 4, v[5:6]
 ; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
+; CHECK-NEXT:    v_add_co_u32 v1, vcc_lo, v1, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, -1, v2, vcc_lo
+; CHECK-NEXT:    v_add3_u32 v0, v3, v0, -16
 ; CHECK-NEXT:    s_mov_b32 s5, 0
-; CHECK-NEXT:    v_add_co_u32 v1, vcc_lo, v7, v1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v8, v2, vcc_lo
-; CHECK-NEXT:    v_add3_u32 v2, v3, v0, -16
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v1, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB15_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[0:1]
-; CHECK-NEXT:    v_add_co_u32 v5, vcc_lo, v5, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, -1, v6, vcc_lo
-; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, -16
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6]
+; CHECK-NEXT:    v_add_co_u32 v3, vcc_lo, v1, v7
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v2, v8, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v7, vcc_lo, v7, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, -1, v8, vcc_lo
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[3:4]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v10, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v9, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v8, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_nc_u32_e32 v2, -16, v2
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[7:8]
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, -16, v0
 ; CHECK-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_cbranch_execnz .LBB15_15
-; CHECK-NEXT:  .LBB15_16: ; %Flow37
+; CHECK-NEXT:  .LBB15_16: ; %Flow36
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1955,49 +1947,47 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
 ; CHECK-LABEL: memmove_p5_p1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 4, v[3:4]
+; CHECK-NEXT:    v_and_b32_e32 v7, -16, v3
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 15, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT:    s_cbranch_execz .LBB16_3
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v7, v2
-; CHECK-NEXT:    v_mov_b32_e32 v6, v1
-; CHECK-NEXT:    v_mov_b32_e32 v8, v0
+; CHECK-NEXT:    v_mov_b32_e32 v9, v0
 ; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB16_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[9:12], v[6:7], off
-; CHECK-NEXT:    s_add_u32 s4, s4, 1
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, 16
+; CHECK-NEXT:    v_add_co_u32 v10, vcc_lo, v1, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT:    s_add_u32 s4, s4, 16
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[4:5]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; CHECK-NEXT:    global_load_dwordx4 v[10:13], v[10:11], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v12, v8, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v11, v8, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, 16, v8
+; CHECK-NEXT:    buffer_store_dword v13, v9, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v12, v9, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v11, v9, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v9, 16, v9
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB16_2
-; CHECK-NEXT:  .LBB16_3: ; %Flow11
+; CHECK-NEXT:  .LBB16_3: ; %Flow9
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT:    v_and_b32_e32 v6, 15, v3
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[5:6]
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB16_7
 ; CHECK-NEXT:  ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_and_b32_e32 v8, -16, v3
-; CHECK-NEXT:    v_lshlrev_b64 v[3:4], 4, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v8
 ; CHECK-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; CHECK-NEXT:  .LBB16_5: ; %loop-memcpy-residual
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2005,7 +1995,7 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
 ; CHECK-NEXT:    global_load_ubyte v3, v[3:4], off
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
@@ -2015,7 +2005,7 @@ define void @memmove_p5_p1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align
 ; CHECK-NEXT:    s_cbranch_execnz .LBB16_5
 ; CHECK-NEXT:  ; %bb.6: ; %Flow
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:  .LBB16_7: ; %Flow9
+; CHECK-NEXT:  .LBB16_7: ; %Flow7
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2027,40 +2017,41 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
 ; CHECK-LABEL: memmove_p5_p3:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[3:4], 3, v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[3:4]
+; CHECK-NEXT:    v_and_b32_e32 v2, -8, v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 7, v4
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[2:3]
 ; CHECK-NEXT:    s_cbranch_execz .LBB17_3
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v5, v1
-; CHECK-NEXT:    v_mov_b32_e32 v6, v0
+; CHECK-NEXT:    v_mov_b32_e32 v7, v1
+; CHECK-NEXT:    v_mov_b32_e32 v8, v0
 ; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:  .LBB17_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ds_read_b64 v[7:8], v5
-; CHECK-NEXT:    s_add_u32 s4, s4, 1
+; CHECK-NEXT:    ds_read_b64 v[9:10], v7
+; CHECK-NEXT:    s_add_u32 s4, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v5, 8, v5
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[3:4]
+; CHECK-NEXT:    v_add_nc_u32_e32 v7, 8, v7
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v8, v6, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_nc_u32_e32 v6, 8, v6
+; CHECK-NEXT:    buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v8, 8, v8
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB17_2
 ; CHECK-NEXT:  .LBB17_3: ; %Flow14
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT:    v_and_b32_e32 v3, 7, v2
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[3:4]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[5:6]
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB17_7
 ; CHECK-NEXT:  ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_and_b32_e32 v2, -8, v2
+; CHECK-NEXT:    v_and_b32_e32 v2, -8, v4
 ; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v2
@@ -2070,7 +2061,7 @@ define void @memmove_p5_p3(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
 ; CHECK-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[3:4]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen
@@ -2091,49 +2082,47 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
 ; CHECK-LABEL: memmove_p5_p4:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 4, v[3:4]
+; CHECK-NEXT:    v_and_b32_e32 v7, -16, v3
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    v_and_b32_e32 v5, 15, v3
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[4:5]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[7:8]
 ; CHECK-NEXT:    s_cbranch_execz .LBB18_3
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
-; CHECK-NEXT:    v_mov_b32_e32 v7, v2
-; CHECK-NEXT:    v_mov_b32_e32 v6, v1
-; CHECK-NEXT:    v_mov_b32_e32 v8, v0
+; CHECK-NEXT:    v_mov_b32_e32 v9, v0
 ; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB18_2: ; %loop-memcpy-expansion
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[9:12], v[6:7], off
-; CHECK-NEXT:    s_add_u32 s4, s4, 1
-; CHECK-NEXT:    v_add_co_u32 v6, vcc_lo, v6, 16
+; CHECK-NEXT:    v_add_co_u32 v10, vcc_lo, v1, s4
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, s5, v2, vcc_lo
+; CHECK-NEXT:    s_add_u32 s4, s4, 16
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[4:5]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; CHECK-NEXT:    global_load_dwordx4 v[10:13], v[10:11], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v12, v8, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v11, v8, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v10, v8, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, 16, v8
+; CHECK-NEXT:    buffer_store_dword v13, v9, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v12, v9, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v11, v9, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v9, 16, v9
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB18_2
-; CHECK-NEXT:  .LBB18_3: ; %Flow11
+; CHECK-NEXT:  .LBB18_3: ; %Flow9
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
-; CHECK-NEXT:    v_and_b32_e32 v6, 15, v3
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[6:7]
+; CHECK-NEXT:    v_cmpx_ne_u64_e32 0, v[5:6]
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB18_7
 ; CHECK-NEXT:  ; %bb.4: ; %loop-memcpy-residual.preheader
-; CHECK-NEXT:    v_and_b32_e32 v8, -16, v3
-; CHECK-NEXT:    v_lshlrev_b64 v[3:4], 4, v[4:5]
+; CHECK-NEXT:    v_and_b32_e32 v3, -16, v3
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v8
 ; CHECK-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v3
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
 ; CHECK-NEXT:  .LBB18_5: ; %loop-memcpy-residual
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2141,7 +2130,7 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s5, v2, vcc_lo
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[6:7]
+; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[5:6]
 ; CHECK-NEXT:    global_load_ubyte v3, v[3:4], off
 ; CHECK-NEXT:    s_or_b32 s7, vcc_lo, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
@@ -2151,7 +2140,7 @@ define void @memmove_p5_p4(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align
 ; CHECK-NEXT:    s_cbranch_execnz .LBB18_5
 ; CHECK-NEXT:  ; %bb.6: ; %Flow
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:  .LBB18_7: ; %Flow9
+; CHECK-NEXT:  .LBB18_7: ; %Flow7
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2163,19 +2152,20 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
 ; CHECK-LABEL: memmove_p5_p5:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v6, 15, v2
-; CHECK-NEXT:    v_mov_b32_e32 v7, 0
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], 4, v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_and_b32_e32 v4, 15, v2
+; CHECK-NEXT:    v_and_b32_e32 v6, -16, v2
+; CHECK-NEXT:    v_mov_b32_e32 v7, v3
 ; CHECK-NEXT:    s_mov_b32 s6, exec_lo
-; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[6:7]
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; CHECK-NEXT:    v_cmp_ne_u64_e64 s4, 0, v[4:5]
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
 ; CHECK-NEXT:    v_cmpx_ge_u32_e64 v1, v0
 ; CHECK-NEXT:    s_xor_b32 s6, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB19_3
-; CHECK-NEXT:  ; %bb.1: ; %Flow44
+; CHECK-NEXT:  ; %bb.1: ; %Flow46
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s5, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB19_10
-; CHECK-NEXT:  .LBB19_2: ; %Flow45
+; CHECK-NEXT:  .LBB19_2: ; %Flow47
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB19_3: ; %memmove_copy_forward
@@ -2193,8 +2183,8 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    buffer_load_dword v10, v3, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v11, v3, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v12, v3, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -16
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
 ; CHECK-NEXT:    v_add_nc_u32_e32 v3, 16, v3
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen offset:12
@@ -2204,12 +2194,12 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:    buffer_store_dword v11, v8, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v12, v8, s[0:3], 0 offen
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[4:5]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
 ; CHECK-NEXT:    v_add_nc_u32_e32 v8, 16, v8
 ; CHECK-NEXT:    s_or_b32 s8, s5, s8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB19_5
-; CHECK-NEXT:  .LBB19_6: ; %Flow39
+; CHECK-NEXT:  .LBB19_6: ; %Flow41
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_and_saveexec_b32 s7, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB19_9
@@ -2221,81 +2211,82 @@ define void @memmove_p5_p5(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT:  .LBB19_8: ; %memmove_fwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_co_u32 v6, s5, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT:    v_add_co_u32 v4, s5, v4, -1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
 ; CHECK-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[4:5]
 ; CHECK-NEXT:    s_or_b32 s8, s5, s8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_execnz .LBB19_8
-; CHECK-NEXT:  .LBB19_9: ; %Flow37
+; CHECK-NEXT:  .LBB19_9: ; %Flow39
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    ; implicit-def: $vgpr1
-; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s5, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB19_2
 ; CHECK-NEXT:  .LBB19_10: ; %memmove_copy_backwards
 ; CHECK-NEXT:    s_and_saveexec_b32 s6, s4
 ; CHECK-NEXT:    s_cbranch_execz .LBB19_13
 ; CHECK-NEXT:  ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, -1, v2
+; CHECK-NEXT:    v_add_nc_u32_e32 v7, -1, v2
 ; CHECK-NEXT:    s_mov_b32 s7, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v3, v0, v8
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, v1, v8
+; CHECK-NEXT:    v_add_nc_u32_e32 v6, v0, v7
+; CHECK-NEXT:    v_add_nc_u32_e32 v7, v1, v7
 ; CHECK-NEXT:  .LBB19_12: ; %memmove_bwd_residual_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    buffer_load_ubyte v9, v8, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_co_u32 v6, s4, v6, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v7, s4, -1, v7, s4
-; CHECK-NEXT:    v_add_nc_u32_e32 v8, -1, v8
-; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[6:7]
+; CHECK-NEXT:    buffer_load_ubyte v8, v7, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_co_u32 v4, s4, v4, -1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
+; CHECK-NEXT:    v_add_nc_u32_e32 v7, -1, v7
+; CHECK-NEXT:    v_cmp_eq_u64_e64 s4, 0, v[4:5]
 ; CHECK-NEXT:    s_or_b32 s7, s4, s7
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v9, v3, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; CHECK-NEXT:    buffer_store_byte v8, v6, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_nc_u32_e32 v6, -1, v6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s7
 ; CHECK-NEXT:    s_cbranch_execnz .LBB19_12
-; CHECK-NEXT:  .LBB19_13: ; %Flow43
+; CHECK-NEXT:  .LBB19_13: ; %Flow45
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; CHECK-NEXT:    s_cbranch_execz .LBB19_16
 ; CHECK-NEXT:  ; %bb.14: ; %memmove_bwd_main_loop.preheader
-; CHECK-NEXT:    v_and_b32_e32 v2, -16, v2
+; CHECK-NEXT:    v_and_b32_e32 v5, -16, v2
 ; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    v_add_nc_u32_e32 v2, -16, v2
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, v0, v2
-; CHECK-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, -16, v5
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, v0, v4
+; CHECK-NEXT:    v_sub_co_u32 v0, vcc_lo, 0, v5
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, v1, v4
+; CHECK-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB19_15: ; %memmove_bwd_main_loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen
-; CHECK-NEXT:    v_add_co_u32 v4, vcc_lo, v4, -1
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
-; CHECK-NEXT:    v_add_nc_u32_e32 v1, -16, v1
+; CHECK-NEXT:    buffer_load_dword v3, v4, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v5, v4, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v4, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v7, v4, s[0:3], 0 offen
+; CHECK-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 16
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; CHECK-NEXT:    v_add_nc_u32_e32 v4, -16, v4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[4:5]
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, -16, v0
+; CHECK-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; CHECK-NEXT:    v_add_nc_u32_e32 v2, -16, v2
 ; CHECK-NEXT:    s_or_b32 s6, vcc_lo, s6
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s6
 ; CHECK-NEXT:    s_cbranch_execnz .LBB19_15
-; CHECK-NEXT:  .LBB19_16: ; %Flow41
+; CHECK-NEXT:  .LBB19_16: ; %Flow43
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]


        


More information about the llvm-commits mailing list