[llvm] [LowerMemIntrinsics] Use i8 GEPs in memcpy/memmove lowering (PR #112707)

via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 17 06:32:27 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Fabian Ritter (ritter-x2a)

Changes:

The IR lowering of the memcpy/memmove intrinsics uses a target-specific type for its load/store operations. Until now, the addresses for these loads and stores have been computed with GEPs whose element type is that load/store type. That is wrong if the type's allocation size differs from its store size: the width of each access is determined by the store size, while the GEP stride is determined by the allocation size. If the allocation size is greater than the store size, some bytes are not copied/moved.
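
To illustrate the problem, here is a minimal sketch (not taken from the patch) of an old-style lowered main loop, assuming the target selects `<3 x i32>` as the loop operand type; under the AMDGPU datalayout that type has a store size of 12 bytes but an allocation size of 16 bytes, so each iteration skips 4 bytes:

```llvm
; Hypothetical old-style main loop for a memcpy lowered with LoopOpType = <3 x i32>.
; The GEP element type is the loop operand type, so the pointers advance by the
; allocation size (16 bytes) per iteration, while each load/store only touches
; the store size (12 bytes).
define void @old_style_copy(ptr %dst, ptr %src, i64 %trip.count) {
entry:
  ; The real lowering guards against a zero trip count; omitted here for brevity.
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %src.gep = getelementptr inbounds <3 x i32>, ptr %src, i64 %i  ; stride = 16 bytes
  %v = load <3 x i32>, ptr %src.gep, align 4                     ; reads 12 bytes
  %dst.gep = getelementptr inbounds <3 x i32>, ptr %dst, i64 %i
  store <3 x i32> %v, ptr %dst.gep, align 4                      ; writes 12 bytes
  %i.next = add i64 %i, 1
  %cond = icmp ult i64 %i.next, %trip.count
  br i1 %cond, label %loop, label %exit

exit:
  ret void
}
```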

This patch changes the GEPs to use i8 addressing, with offsets based on the type's store size. The correctness of the lowering therefore no longer depends on the type's allocation size.
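
With the byte-addressed GEPs from this patch, the sketch above would instead look roughly like the following (again with hypothetical names, assuming the same `<3 x i32>` loop operand type); the loop index now counts bytes and advances by the store size, so no bytes are skipped:

```llvm
; Hypothetical new-style main loop: i8 GEPs with byte offsets derived from the
; store size. The access type is unchanged; only the addressing is byte-based.
define void @new_style_copy(ptr %dst, ptr %src, i64 %loop.bytes) {
entry:
  ; %loop.bytes is the copy length rounded down to a multiple of the store size;
  ; the zero-trip guard and the residual loop are omitted for brevity.
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %src.gep = getelementptr inbounds i8, ptr %src, i64 %i  ; byte offset %i
  %v = load <3 x i32>, ptr %src.gep, align 4              ; still a 12-byte access
  %dst.gep = getelementptr inbounds i8, ptr %dst, i64 %i
  store <3 x i32> %v, ptr %dst.gep, align 4
  %i.next = add i64 %i, 12                                ; advance by the store size
  %cond = icmp ult i64 %i.next, %loop.bytes
  br i1 %cond, label %loop, label %exit

exit:
  ret void
}
```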

This is in support of PR #112332, which allows adjusting the memcpy loop lowering type through a command-line argument in the AMDGPU backend.

---

Patch is 187.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112707.diff


4 Files Affected:

- (modified) llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp (+109-107) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll (+77-82) 
- (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll (+441-465) 
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll (+93-111) 


``````````diff
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index ba62d75250c85e..ef6db0a8c98cc8 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -49,11 +49,12 @@ void llvm::createMemCpyLoopKnownSize(
   assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
          "Atomic memcpy lowering is not supported for vector operand type");
 
+  Type *Int8Type = Type::getInt8Ty(Ctx);
   unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
   assert((!AtomicElementSize || LoopOpSize % *AtomicElementSize == 0) &&
-      "Atomic memcpy lowering is not supported for selected operand size");
+         "Atomic memcpy lowering is not supported for selected operand size");
 
-  uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+  uint64_t LoopEndCount = (CopyLen->getZExtValue() / LoopOpSize) * LoopOpSize;
 
   if (LoopEndCount != 0) {
     // Split
@@ -71,8 +72,12 @@ void llvm::createMemCpyLoopKnownSize(
     PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
     LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
     // Loop Body
-    Value *SrcGEP =
-        LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+
+    // If we used LoopOpType as GEP element type, we would iterate over the
+    // buffers in TypeAllocSize strides while copying only TypeStoreSize bytes,
+    // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
+    // use byte offsets computed from the TypeStoreSize.
+    Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex);
     LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
                                                    PartSrcAlign, SrcIsVolatile);
     if (!CanOverlap) {
@@ -80,8 +85,7 @@ void llvm::createMemCpyLoopKnownSize(
       Load->setMetadata(LLVMContext::MD_alias_scope,
                         MDNode::get(Ctx, NewScope));
     }
-    Value *DstGEP =
-        LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+    Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex);
     StoreInst *Store = LoopBuilder.CreateAlignedStore(
         Load, DstGEP, PartDstAlign, DstIsVolatile);
     if (!CanOverlap) {
@@ -92,8 +96,8 @@ void llvm::createMemCpyLoopKnownSize(
       Load->setAtomic(AtomicOrdering::Unordered);
       Store->setAtomic(AtomicOrdering::Unordered);
     }
-    Value *NewIndex =
-        LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
+    Value *NewIndex = LoopBuilder.CreateAdd(
+        LoopIndex, ConstantInt::get(TypeOfCopyLen, LoopOpSize));
     LoopIndex->addIncoming(NewIndex, LoopBB);
 
     // Create the loop branch condition.
@@ -102,7 +106,7 @@ void llvm::createMemCpyLoopKnownSize(
                              LoopBB, PostLoopBB);
   }
 
-  uint64_t BytesCopied = LoopEndCount * LoopOpSize;
+  uint64_t BytesCopied = LoopEndCount;
   uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
   if (RemainingBytes) {
     IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
@@ -117,18 +121,13 @@ void llvm::createMemCpyLoopKnownSize(
       Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
       Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
 
-      // Calculate the new index
       unsigned OperandSize = DL.getTypeStoreSize(OpTy);
       assert(
           (!AtomicElementSize || OperandSize % *AtomicElementSize == 0) &&
           "Atomic memcpy lowering is not supported for selected operand size");
 
-      uint64_t GepIndex = BytesCopied / OperandSize;
-      assert(GepIndex * OperandSize == BytesCopied &&
-             "Division should have no Remainder!");
-
       Value *SrcGEP = RBuilder.CreateInBoundsGEP(
-          OpTy, SrcAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+          Int8Type, SrcAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
       LoadInst *Load =
           RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
       if (!CanOverlap) {
@@ -137,7 +136,7 @@ void llvm::createMemCpyLoopKnownSize(
                           MDNode::get(Ctx, NewScope));
       }
       Value *DstGEP = RBuilder.CreateInBoundsGEP(
-          OpTy, DstAddr, ConstantInt::get(TypeOfCopyLen, GepIndex));
+          Int8Type, DstAddr, ConstantInt::get(TypeOfCopyLen, BytesCopied));
       StoreInst *Store = RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
                                                      DstIsVolatile);
       if (!CanOverlap) {
@@ -155,16 +154,6 @@ void llvm::createMemCpyLoopKnownSize(
          "Bytes copied should match size in the call!");
 }
 
-// \returns \p Len udiv \p OpSize, checking for optimization opportunities.
-static Value *getRuntimeLoopCount(const DataLayout &DL, IRBuilderBase &B,
-                                  Value *Len, Value *OpSize,
-                                  unsigned OpSizeVal) {
-  // For powers of 2, we can lshr by log2 instead of using udiv.
-  if (isPowerOf2_32(OpSizeVal))
-    return B.CreateLShr(Len, Log2_32(OpSizeVal));
-  return B.CreateUDiv(Len, OpSize);
-}
-
 // \returns \p Len urem \p OpSize, checking for optimization opportunities.
 static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B,
                                       Value *Len, Value *OpSize,
@@ -175,6 +164,18 @@ static Value *getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B,
   return B.CreateURem(Len, OpSize);
 }
 
+// \returns (\p Len udiv \p OpSize) mul \p OpSize, checking for optimization
+// opportunities.
+// If RTLoopRemainder is provided, it must be the result of
+// getRuntimeLoopRemainder() with the same arguments.
+static Value *getRuntimeLoopBytes(const DataLayout &DL, IRBuilderBase &B,
+                                  Value *Len, Value *OpSize, unsigned OpSizeVal,
+                                  Value *RTLoopRemainder = nullptr) {
+  if (!RTLoopRemainder)
+    RTLoopRemainder = getRuntimeLoopRemainder(DL, B, Len, OpSize, OpSizeVal);
+  return B.CreateSub(Len, RTLoopRemainder);
+}
+
 void llvm::createMemCpyLoopUnknownSize(
     Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
     Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
@@ -213,10 +214,15 @@ void llvm::createMemCpyLoopUnknownSize(
   Type *Int8Type = Type::getInt8Ty(Ctx);
   bool LoopOpIsInt8 = LoopOpType == Int8Type;
   ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
-  Value *RuntimeLoopCount = LoopOpIsInt8
-                                ? CopyLen
-                                : getRuntimeLoopCount(DL, PLBuilder, CopyLen,
-                                                      CILoopOpSize, LoopOpSize);
+
+  Value *RuntimeLoopBytes = CopyLen;
+  Value *RuntimeResidualBytes = nullptr;
+  if (!LoopOpIsInt8) {
+    RuntimeResidualBytes = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
+                                                   CILoopOpSize, LoopOpSize);
+    RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize,
+                                           LoopOpSize, RuntimeResidualBytes);
+  }
 
   BasicBlock *LoopBB =
       BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
@@ -228,14 +234,18 @@ void llvm::createMemCpyLoopUnknownSize(
   PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
   LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
 
-  Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+  // If we used LoopOpType as GEP element type, we would iterate over the
+  // buffers in TypeAllocSize strides while copying only TypeStoreSize bytes,
+  // i.e., we would miss bytes if TypeStoreSize != TypeAllocSize. Therefore,
+  // use byte offsets computed from the TypeStoreSize.
+  Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, LoopIndex);
   LoadInst *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
                                                  PartSrcAlign, SrcIsVolatile);
   if (!CanOverlap) {
     // Set alias scope for loads.
     Load->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, NewScope));
   }
-  Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+  Value *DstGEP = LoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LoopIndex);
   StoreInst *Store =
       LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
   if (!CanOverlap) {
@@ -246,13 +256,13 @@ void llvm::createMemCpyLoopUnknownSize(
     Load->setAtomic(AtomicOrdering::Unordered);
     Store->setAtomic(AtomicOrdering::Unordered);
   }
-  Value *NewIndex =
-      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
+  Value *NewIndex = LoopBuilder.CreateAdd(
+      LoopIndex, ConstantInt::get(CopyLenType, LoopOpSize));
   LoopIndex->addIncoming(NewIndex, LoopBB);
 
-  bool requiresResidual =
+  bool RequiresResidual =
       !LoopOpIsInt8 && !(AtomicElementSize && LoopOpSize == AtomicElementSize);
-  if (requiresResidual) {
+  if (RequiresResidual) {
     Type *ResLoopOpType = AtomicElementSize
                               ? Type::getIntNTy(Ctx, *AtomicElementSize * 8)
                               : Int8Type;
@@ -263,14 +273,9 @@ void llvm::createMemCpyLoopUnknownSize(
     Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize));
     Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize));
 
-    Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
-                                                     CILoopOpSize, LoopOpSize);
-    Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
-
     // Loop body for the residual copy.
-    BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
-                                               PreLoopBB->getParent(),
-                                               PostLoopBB);
+    BasicBlock *ResLoopBB = BasicBlock::Create(
+        Ctx, "loop-memcpy-residual", PreLoopBB->getParent(), PostLoopBB);
     // Residual loop header.
     BasicBlock *ResHeaderBB = BasicBlock::Create(
         Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
@@ -281,17 +286,17 @@ void llvm::createMemCpyLoopUnknownSize(
     // non-zero and finally branch to after the residual loop if the memcpy
     //  size is zero.
     ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
-    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero),
                            LoopBB, ResHeaderBB);
     PreLoopBB->getTerminator()->eraseFromParent();
 
     LoopBuilder.CreateCondBr(
-        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB,
         ResHeaderBB);
 
     // Determine if we need to branch to the residual loop or bypass it.
     IRBuilder<> RHBuilder(ResHeaderBB);
-    RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
+    RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidualBytes, Zero),
                            ResLoopBB, PostLoopBB);
 
     // Copy the residual with single byte load/store loop.
@@ -300,9 +305,8 @@ void llvm::createMemCpyLoopUnknownSize(
         ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
     ResidualIndex->addIncoming(Zero, ResHeaderBB);
 
-    Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
-    Value *SrcGEP =
-        ResBuilder.CreateInBoundsGEP(ResLoopOpType, SrcAddr, FullOffset);
+    Value *FullOffset = ResBuilder.CreateAdd(RuntimeLoopBytes, ResidualIndex);
+    Value *SrcGEP = ResBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, FullOffset);
     LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
                                                   ResSrcAlign, SrcIsVolatile);
     if (!CanOverlap) {
@@ -310,8 +314,7 @@ void llvm::createMemCpyLoopUnknownSize(
       Load->setMetadata(LLVMContext::MD_alias_scope,
                         MDNode::get(Ctx, NewScope));
     }
-    Value *DstGEP =
-        ResBuilder.CreateInBoundsGEP(ResLoopOpType, DstAddr, FullOffset);
+    Value *DstGEP = ResBuilder.CreateInBoundsGEP(Int8Type, DstAddr, FullOffset);
     StoreInst *Store =
         ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
     if (!CanOverlap) {
@@ -328,7 +331,7 @@ void llvm::createMemCpyLoopUnknownSize(
 
     // Create the loop branch condition.
     ResBuilder.CreateCondBr(
-        ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
+        ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidualBytes), ResLoopBB,
         PostLoopBB);
   } else {
     // In this case the loop operand type was a byte, and there is no need for a
@@ -336,11 +339,11 @@ void llvm::createMemCpyLoopUnknownSize(
     // We do however need to patch up the control flow by creating the
     // terminators for the preloop block and the memcpy loop.
     ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
-    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+    PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopBytes, Zero),
                            LoopBB, PostLoopBB);
     PreLoopBB->getTerminator()->eraseFromParent();
     LoopBuilder.CreateCondBr(
-        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+        LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopBytes), LoopBB,
         PostLoopBB);
   }
 }
@@ -425,27 +428,25 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
   // Calculate the loop trip count and remaining bytes to copy after the loop.
   IntegerType *ILengthType = cast<IntegerType>(TypeOfCopyLen);
   ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+  ConstantInt *CIResidualLoopOpSize =
+      ConstantInt::get(ILengthType, ResidualLoopOpSize);
   ConstantInt *Zero = ConstantInt::get(ILengthType, 0);
-  ConstantInt *One = ConstantInt::get(ILengthType, 1);
 
   IRBuilder<> PLBuilder(InsertBefore);
 
-  Value *RuntimeLoopCount = CopyLen;
+  Value *RuntimeLoopBytes = CopyLen;
   Value *RuntimeLoopRemainder = nullptr;
-  Value *RuntimeBytesCopiedMainLoop = CopyLen;
   Value *SkipResidualCondition = nullptr;
   if (RequiresResidual) {
-    RuntimeLoopCount =
-        getRuntimeLoopCount(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
     RuntimeLoopRemainder = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
                                                    CILoopOpSize, LoopOpSize);
-    RuntimeBytesCopiedMainLoop =
-        PLBuilder.CreateSub(CopyLen, RuntimeLoopRemainder);
+    RuntimeLoopBytes = getRuntimeLoopBytes(DL, PLBuilder, CopyLen, CILoopOpSize,
+                                           LoopOpSize, RuntimeLoopRemainder);
     SkipResidualCondition =
         PLBuilder.CreateICmpEQ(RuntimeLoopRemainder, Zero, "skip_residual");
   }
   Value *SkipMainCondition =
-      PLBuilder.CreateICmpEQ(RuntimeLoopCount, Zero, "skip_main");
+      PLBuilder.CreateICmpEQ(RuntimeLoopBytes, Zero, "skip_main");
 
   // Create the a comparison of src and dst, based on which we jump to either
   // the forward-copy part of the function (if src >= dst) or the backwards-copy
@@ -464,7 +465,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
   SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore->getIterator(),
                                 &ThenTerm, &ElseTerm);
 
-  // If the LoopOpSize is greater than 1, each part of the function consist of
+  // If the LoopOpSize is greater than 1, each part of the function consists of
   // four blocks:
   //   memmove_copy_backwards:
   //       skip the residual loop when 0 iterations are required
@@ -519,14 +520,18 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
       IRBuilder<> ResidualLoopBuilder(ResidualLoopBB);
       PHINode *ResidualLoopPhi = ResidualLoopBuilder.CreatePHI(ILengthType, 0);
       Value *ResidualIndex = ResidualLoopBuilder.CreateSub(
-          ResidualLoopPhi, One, "bwd_residual_index");
-      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(
-          ResidualLoopOpType, SrcAddr, ResidualIndex);
+          ResidualLoopPhi, CIResidualLoopOpSize, "bwd_residual_index");
+      // If we used LoopOpType as GEP element type, we would iterate over the
+      // buffers in TypeAllocSize strides while copying only TypeStoreSize
+      // bytes, i.e., we would miss bytes if TypeStoreSize != TypeAllocSize.
+      // Therefore, use byte offsets computed from the TypeStoreSize.
+      Value *LoadGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr,
+                                                             ResidualIndex);
       Value *Element = ResidualLoopBuilder.CreateAlignedLoad(
           ResidualLoopOpType, LoadGEP, ResidualSrcAlign, SrcIsVolatile,
           "element");
-      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(
-          ResidualLoopOpType, DstAddr, ResidualIndex);
+      Value *StoreGEP = ResidualLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
+                                                              ResidualIndex);
       ResidualLoopBuilder.CreateAlignedStore(Element, StoreGEP,
                                              ResidualDstAlign, DstIsVolatile);
 
@@ -537,8 +542,7 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
       IRBuilder<> IntermediateBuilder(IntermediateBB);
       IntermediateBuilder.CreateUnreachable();
       ResidualLoopBuilder.CreateCondBr(
-          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex,
-                                           RuntimeBytesCopiedMainLoop),
+          ResidualLoopBuilder.CreateICmpEQ(ResidualIndex, RuntimeLoopBytes),
           IntermediateBB, ResidualLoopBB);
 
       ResidualLoopPhi->addIncoming(ResidualIndex, ResidualLoopBB);
@@ -556,19 +560,19 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
     IRBuilder<> MainLoopBuilder(MainLoopBB);
     PHINode *MainLoopPhi = MainLoopBuilder.CreatePHI(ILengthType, 0);
     Value *MainIndex =
-        MainLoopBuilder.CreateSub(MainLoopPhi, One, "bwd_main_index");
+        MainLoopBuilder.CreateSub(MainLoopPhi, CILoopOpSize, "bwd_main_index");
     Value *LoadGEP =
-        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainIndex);
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainIndex);
     Value *Element = MainLoopBuilder.CreateAlignedLoad(
         LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
     Value *StoreGEP =
-        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainIndex);
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainIndex);
     MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                        DstIsVolatile);
     MainLoopBuilder.CreateCondBr(MainLoopBuilder.CreateICmpEQ(MainIndex, Zero),
                                  ExitBB, MainLoopBB);
     MainLoopPhi->addIncoming(MainIndex, MainLoopBB);
-    MainLoopPhi->addIncoming(RuntimeLoopCount, PredBB);
+    MainLoopPhi->addIncoming(RuntimeLoopBytes, PredBB);
 
     // How to get to the main loop:
     Instruction *PredBBTerm = PredBB->getTerminator();
@@ -586,14 +590,14 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
     PHINode *MainLoopPhi =
         MainLoopBuilder.CreatePHI(ILengthType, 0, "fwd_main_index");
     Value *LoadGEP =
-        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, MainLoopPhi);
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, SrcAddr, MainLoopPhi);
     Value *Element = MainLoopBuilder.CreateAlignedLoad(
         LoopOpType, LoadGEP, PartSrcAlign, SrcIsVolatile, "element");
     Value *StoreGEP =
-        MainLoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, MainLoopPhi);
+        MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, MainLoopPhi);
     MainLoopBuilder.CreateAlignedStore(Element, StoreGEP, PartDstAlign,
                                        DstIsVolatile);
-    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, One);
+    Value *MainIndex = MainLoopBuilder.CreateAdd(MainLoopPhi, CILoopOpSize);
     MainLo...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/112707


More information about the llvm-commits mailing list