[llvm-branch-commits] [llvm] 3667122 - Revert "[MemCpyOpt] support offset slices for performStackMoveOptzn and proce…"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jan 21 11:14:09 PST 2026
Author: Jameson Nash
Date: 2026-01-21T14:14:05-05:00
New Revision: 366712217e5455a8e2a24c43832c0b4ef609a80f
URL: https://github.com/llvm/llvm-project/commit/366712217e5455a8e2a24c43832c0b4ef609a80f
DIFF: https://github.com/llvm/llvm-project/commit/366712217e5455a8e2a24c43832c0b4ef609a80f.diff
LOG: Revert "[MemCpyOpt] support offset slices for performStackMoveOptzn and proce…"
This reverts commit 019eb855dd6a18a8f7ae5dd86abf6bc3ad0d9fa4.
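For context, the reverted commit taught the stack-move optimization to accept constant-offset slices of allocas (e.g. a GEP into the source alloca) instead of requiring the raw AllocaInst pointers, rewriting the dest alloca's uses to a pointer offset by the slice difference. A minimal before/after sketch in LLVM IR, modeled on the deleted stack-move-offset.ll test below (names and the initializing store are illustrative):

Before:
    %src = alloca [16 x i8], align 8
    %dest = alloca [8 x i8], align 8
    %src.gep = getelementptr inbounds i8, ptr %src, i64 8
    store i64 42, ptr %src.gep
    call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src.gep, i64 8, i1 false)
    call void @use(ptr %dest)

After (with the reverted patch applied):
    %src = alloca [16 x i8], align 8
    %src.gep = getelementptr inbounds i8, ptr %src, i64 8
    store i64 42, ptr %src.gep
    ; %dest is merged into %src; its uses go through the same offset
    call void @use(ptr %src.gep)

With this revert, both operands must again be the AllocaInst itself, so cases like the one above are left alone.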
Added:
Modified:
llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
Removed:
llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
################################################################################
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index b87ea0b9d243f..496d2958fc2d0 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -16,7 +16,6 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -55,12 +54,12 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
public:
MemCpyOptPass() = default;
- LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
// Glue for the old PM.
- LLVM_ABI bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
- AssumptionCache *AC, DominatorTree *DT,
- PostDominatorTree *PDT, MemorySSA *MSSA);
+ bool runImpl(Function &F, TargetLibraryInfo *TLI, AAResults *AA,
+ AssumptionCache *AC, DominatorTree *DT, PostDominatorTree *PDT,
+ MemorySSA *MSSA);
private:
// Helper functions
@@ -86,8 +85,8 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
Value *ByteVal);
bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
- Value *DestPtr, Value *SrcPtr, TypeSize Size,
- BatchAAResults &BAA);
+ AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
+ TypeSize Size, BatchAAResults &BAA);
bool isMemMoveMemSetDependency(MemMoveInst *M);
void eraseInstruction(Instruction *I);
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index d8d784f2ff774..5de3dfa148314 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -726,15 +726,18 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
// If this is a load-store pair from a stack slot to a stack slot, we
// might be able to perform the stack-move optimization just as we do for
// memcpys from an alloca to an alloca.
- if (performStackMoveOptzn(LI, SI, SI->getPointerOperand(),
- LI->getPointerOperand(), DL.getTypeStoreSize(T),
- BAA)) {
- // Avoid invalidating the iterator.
- BBI = SI->getNextNode()->getIterator();
- eraseInstruction(SI);
- eraseInstruction(LI);
- ++NumMemCpyInstr;
- return true;
+ if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
+ if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
+ if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
+ DL.getTypeStoreSize(T), BAA)) {
+ // Avoid invalidating the iterator.
+ BBI = SI->getNextNode()->getIterator();
+ eraseInstruction(SI);
+ eraseInstruction(LI);
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
}
return false;
@@ -1494,47 +1497,20 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
// transformation only because we restrict the scope of this optimization to
// allocas that aren't captured.
bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
- Value *DestPtr, Value *SrcPtr,
- TypeSize Size, BatchAAResults &BAA) {
+ AllocaInst *DestAlloca,
+ AllocaInst *SrcAlloca, TypeSize Size,
+ BatchAAResults &BAA) {
LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
<< *Store << "\n");
- AllocaInst *DestAlloca = dyn_cast<AllocaInst>(getUnderlyingObject(DestPtr));
- if (!DestAlloca)
- return false;
-
- AllocaInst *SrcAlloca = dyn_cast<AllocaInst>(getUnderlyingObject(SrcPtr));
- if (!SrcAlloca)
- return false;
-
- // Explicitly don't handle degenerate case of a partial copy within one
- // alloca. It would always fail the dominator check later anyways, and
- // possibly the modref checks also.
- if (SrcAlloca == DestAlloca)
- return false;
-
// Make sure the two allocas are in the same address space.
if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
return false;
}
- if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
- return false;
-
// Check that copy is full with static size.
const DataLayout &DL = DestAlloca->getDataLayout();
-
- auto DestOffset = DestPtr->getPointerOffsetFrom(DestAlloca, DL);
- if (!DestOffset)
- return false;
-
- auto SrcOffset = SrcPtr->getPointerOffsetFrom(SrcAlloca, DL);
- if (!SrcOffset || *SrcOffset < *DestOffset || *SrcOffset < 0)
- return false;
- // Offset difference must preserve dest alloca's alignment.
- if ((*SrcOffset - *DestOffset) % DestAlloca->getAlign().value() != 0)
- return false;
std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
if (!SrcSize || !DestSize)
@@ -1542,12 +1518,14 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
if (*SrcSize != *DestSize)
if (!SrcSize->isFixed() || !DestSize->isFixed())
return false;
- // Check that copy covers entirety of dest alloca.
- if (Size != *DestSize || *DestOffset != 0) {
+ if (Size != *DestSize) {
LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
return false;
}
+ if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
+ return false;
+
// Check if it will be legal to combine allocas without breaking dominator.
bool MoveSrc = !DT->dominates(SrcAlloca, DestAlloca);
if (MoveSrc) {
@@ -1711,13 +1689,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
}
// Merge the two allocas.
- Value *NewDestPtr = SrcAlloca;
- if (*SrcOffset != *DestOffset) {
- IRBuilder<> Builder(DestAlloca);
- NewDestPtr = Builder.CreateInBoundsPtrAdd(
- SrcAlloca, Builder.getInt64(*SrcOffset - *DestOffset));
- }
- DestAlloca->replaceAllUsesWith(NewDestPtr);
+ DestAlloca->replaceAllUsesWith(SrcAlloca);
eraseInstruction(DestAlloca);
// Drop metadata on the source alloca.
@@ -1788,7 +1760,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
return false;
// If copying from a constant, try to turn the memcpy into a memset.
- if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(M->getSource())))
+ if (auto *GV = dyn_cast<GlobalVariable>(M->getSource()))
if (GV->isConstant() && GV->hasDefinitiveInitializer())
if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
M->getDataLayout())) {
@@ -1874,10 +1846,16 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
// If the transfer is from a stack slot to a stack slot, then we may be able
// to perform the stack-move optimization. See the comments in
// performStackMoveOptzn() for more details.
+ auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
+ if (!DestAlloca)
+ return false;
+ auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
+ if (!SrcAlloca)
+ return false;
ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
if (Len == nullptr)
return false;
- if (performStackMoveOptzn(M, M, M->getDest(), M->getSource(),
+ if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
TypeSize::getFixed(Len->getZExtValue()), BAA)) {
// Avoid invalidating the iterator.
BBI = M->getNextNode()->getIterator();
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll b/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
deleted file mode 100644
index fdf5e838b20e5..0000000000000
--- a/llvm/test/Transforms/MemCpyOpt/stack-move-offset.ll
+++ /dev/null
@@ -1,238 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -passes=memcpyopt -verify-memoryssa -S | FileCheck %s
-
-; Test that stack-move optimization works when src is a GEP into an alloca.
-; For the optimization to trigger:
-; - The copy must cover the entire dest alloca (Size == DestSize, DestOffset == 0)
-; - SrcOffset must be a multiple of DestAlloca's alignment
-; - SrcOffset must be non-negative
-
-declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
-declare void @llvm.lifetime.start.p0(ptr nocapture)
-declare void @llvm.lifetime.end.p0(ptr nocapture)
-
-declare void @use_nocapture(ptr nocapture)
-
-; Basic test: memcpy from GEP(src) to dest alloca
-; src = [16 x i8], dest = [8 x i8] align 8, copy 8 bytes
-; SrcOffset(8) is a multiple of DestAlign(8), so optimization applies.
-; After optimization: dest uses become src+8
-define void @memcpy_src_gep_to_dest_alloca() {
-; CHECK-LABEL: define void @memcpy_src_gep_to_dest_alloca() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 8
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
-; CHECK-NEXT: [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
-; CHECK-NEXT: store i64 42, ptr [[SRC_GEP1]], align 4
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
-; CHECK-NEXT: ret void
-;
- %src = alloca [16 x i8], align 4
- %dest = alloca [8 x i8], align 8
- call void @llvm.lifetime.start.p0(ptr %src)
- call void @llvm.lifetime.start.p0(ptr %dest)
- %src.gep = getelementptr inbounds i8, ptr %src, i64 8
- store i64 42, ptr %src.gep
- call void @use_nocapture(ptr nocapture %src.gep)
-
- call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src.gep, i64 8, i1 false)
-
- call void @use_nocapture(ptr nocapture %dest)
-
- call void @llvm.lifetime.end.p0(ptr %src)
- call void @llvm.lifetime.end.p0(ptr %dest)
- ret void
-}
-
-; Test: memcpy from GEP(src) to dest alloca with different offset
-; src = [12 x i8], dest = [8 x i8] align 4, copy 8 bytes from src+4
-; SrcOffset(4) is a multiple of DestAlign(4), so optimization applies.
-; After optimization: dest uses become src+4
-define void @memcpy_src_gep_offset4_to_dest_alloca() {
-; CHECK-LABEL: define void @memcpy_src_gep_offset4_to_dest_alloca() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [12 x i8], align 4
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
-; CHECK-NEXT: [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
-; CHECK-NEXT: store i64 42, ptr [[SRC_GEP1]], align 4
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
-; CHECK-NEXT: ret void
-;
- %src = alloca [12 x i8], align 4
- %dest = alloca [8 x i8], align 4
- call void @llvm.lifetime.start.p0(ptr %src)
- call void @llvm.lifetime.start.p0(ptr %dest)
- %src.gep = getelementptr inbounds i8, ptr %src, i64 4
- store i64 42, ptr %src.gep
- call void @use_nocapture(ptr nocapture %src.gep)
-
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src.gep, i64 8, i1 false)
-
- call void @use_nocapture(ptr nocapture %dest)
-
- call void @llvm.lifetime.end.p0(ptr %src)
- call void @llvm.lifetime.end.p0(ptr %dest)
- ret void
-}
-
-; Test: load/store from GEP(src) to dest alloca
-; src = [8 x i8], dest = [4 x i8] align 4, load/store 4 bytes from src+4
-; SrcOffset(4) is a multiple of DestAlign(4), so optimization applies.
-define void @load_store_src_gep_to_dest_alloca() {
-; CHECK-LABEL: define void @load_store_src_gep_to_dest_alloca() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [8 x i8], align 4
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
-; CHECK-NEXT: [[SRC_GEP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
-; CHECK-NEXT: store i32 42, ptr [[SRC_GEP1]], align 4
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP1]])
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
-; CHECK-NEXT: ret void
-;
- %src = alloca [8 x i8], align 4
- %dest = alloca [4 x i8], align 4
- call void @llvm.lifetime.start.p0(ptr %src)
- call void @llvm.lifetime.start.p0(ptr %dest)
- %src.gep = getelementptr inbounds i8, ptr %src, i64 4
- store i32 42, ptr %src.gep
- call void @use_nocapture(ptr nocapture %src.gep)
-
- %val = load i32, ptr %src.gep
- store i32 %val, ptr %dest
-
- call void @use_nocapture(ptr nocapture %dest)
-
- call void @llvm.lifetime.end.p0(ptr %src)
- call void @llvm.lifetime.end.p0(ptr %dest)
- ret void
-}
-
-; Test: both src and dest are direct allocas (no offset), same size
-; This is the basic stack-move case, included here for completeness.
-define void @memcpy_both_direct_allocas() {
-; CHECK-LABEL: define void @memcpy_both_direct_allocas() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [8 x i8], align 8
-; CHECK-NEXT: store i64 42, ptr [[SRC]], align 4
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC]])
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC]])
-; CHECK-NEXT: ret void
-;
- %src = alloca [8 x i8], align 4
- %dest = alloca [8 x i8], align 8
- call void @llvm.lifetime.start.p0(ptr %src)
- call void @llvm.lifetime.start.p0(ptr %dest)
- store i64 42, ptr %src
- call void @use_nocapture(ptr nocapture %src)
-
- call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 4 %src, i64 8, i1 false)
-
- call void @use_nocapture(ptr nocapture %dest)
-
- call void @llvm.lifetime.end.p0(ptr %src)
- call void @llvm.lifetime.end.p0(ptr %dest)
- ret void
-}
-
-; Negative test: dest has offset (dest is GEP, not direct alloca)
-; The optimization requires DestOffset == 0.
-define void @no_optimize_dest_has_offset() {
-; CHECK-LABEL: define void @no_optimize_dest_has_offset() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [16 x i8], align 4
-; CHECK-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 8
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[DEST]])
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 8
-; CHECK-NEXT: [[DEST_GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 8
-; CHECK-NEXT: store i64 42, ptr [[SRC_GEP]], align 4
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST_GEP]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[DEST_GEP]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[DEST]])
-; CHECK-NEXT: ret void
-;
- %src = alloca [16 x i8], align 4
- %dest = alloca [16 x i8], align 8
- call void @llvm.lifetime.start.p0(ptr %src)
- call void @llvm.lifetime.start.p0(ptr %dest)
- %src.gep = getelementptr inbounds i8, ptr %src, i64 8
- %dest.gep = getelementptr inbounds i8, ptr %dest, i64 8
- store i64 42, ptr %src.gep
- call void @use_nocapture(ptr nocapture %src.gep)
-
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest.gep, ptr align 4 %src.gep, i64 8, i1 false)
-
- call void @use_nocapture(ptr nocapture %dest.gep)
-
- call void @llvm.lifetime.end.p0(ptr %src)
- call void @llvm.lifetime.end.p0(ptr %dest)
- ret void
-}
-
-; Negative test: copy doesn't cover entire dest alloca (Size != DestSize)
-; src = [12 x i8], dest = [16 x i8], copy only 8 bytes
-define void @no_optimize_partial_dest_copy() {
-; CHECK-LABEL: define void @no_optimize_partial_dest_copy() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [12 x i8], align 4
-; CHECK-NEXT: [[DEST:%.*]] = alloca [16 x i8], align 4
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[DEST]])
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
-; CHECK-NEXT: store i64 42, ptr [[SRC_GEP]], align 4
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[DEST]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[DEST]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[DEST]])
-; CHECK-NEXT: ret void
-;
- %src = alloca [12 x i8], align 4
- %dest = alloca [16 x i8], align 4
- call void @llvm.lifetime.start.p0(ptr %src)
- call void @llvm.lifetime.start.p0(ptr %dest)
- %src.gep = getelementptr inbounds i8, ptr %src, i64 4
- store i64 42, ptr %src.gep
- call void @use_nocapture(ptr nocapture %src.gep)
-
- call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %src.gep, i64 8, i1 false)
-
- call void @use_nocapture(ptr nocapture %dest)
-
- call void @llvm.lifetime.end.p0(ptr %src)
- call void @llvm.lifetime.end.p0(ptr %dest)
- ret void
-}
-
-; Negative test: SrcOffset not a multiple of DestAlign
-; src = [12 x i8] with offset 4, dest = [8 x i8] align 8
-; SrcOffset(4) % DestAlign(8) = 4 != 0 -> rejected
-define void @no_optimize_alignment_mismatch() {
-; CHECK-LABEL: define void @no_optimize_alignment_mismatch() {
-; CHECK-NEXT: [[SRC:%.*]] = alloca [12 x i8], align 4
-; CHECK-NEXT: [[DEST:%.*]] = alloca [8 x i8], align 8
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[DEST]])
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
-; CHECK-NEXT: store i64 42, ptr [[SRC_GEP]], align 4
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[SRC_GEP]])
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DEST]], ptr align 4 [[SRC_GEP]], i64 8, i1 false)
-; CHECK-NEXT: call void @use_nocapture(ptr captures(none) [[DEST]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[SRC]])
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[DEST]])
-; CHECK-NEXT: ret void
-;
- %src = alloca [12 x i8], align 4
- %dest = alloca [8 x i8], align 8
- call void @llvm.lifetime.start.p0(ptr %src)
- call void @llvm.lifetime.start.p0(ptr %dest)
- %src.gep = getelementptr inbounds i8, ptr %src, i64 4
- store i64 42, ptr %src.gep
- call void @use_nocapture(ptr nocapture %src.gep)
-
- call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 4 %src.gep, i64 8, i1 false)
-
- call void @use_nocapture(ptr nocapture %dest)
-
- call void @llvm.lifetime.end.p0(ptr %src)
- call void @llvm.lifetime.end.p0(ptr %dest)
- ret void
-}
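After this revert, the optimization again fires only for direct alloca-to-alloca copies that cover the whole destination, as in the @memcpy_both_direct_allocas pattern above. A sketch of that retained case (the two slots are merged and the surviving alloca takes the larger alignment):

    %src = alloca [8 x i8], align 4
    %dest = alloca [8 x i8], align 8
    call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dest, ptr align 4 %src, i64 8, i1 false)
    ; => %dest's uses are replaced with %src, which becomes align 8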