[llvm] 64c54c5 - [MemCpyOpt] Regenerate test checks (NFC)

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 2 09:42:32 PDT 2020


Author: Nikita Popov
Date: 2020-10-02T18:42:13+02:00
New Revision: 64c54c5459cfae8478ce28710784f36b0d94fb2f

URL: https://github.com/llvm/llvm-project/commit/64c54c5459cfae8478ce28710784f36b0d94fb2f
DIFF: https://github.com/llvm/llvm-project/commit/64c54c5459cfae8478ce28710784f36b0d94fb2f.diff

LOG: [MemCpyOpt] Regenerate test checks (NFC)

Added: 
    

Modified: 
    llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
    llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
    llvm/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll
    llvm/test/Transforms/MemCpyOpt/aggregate-type-crash.ll
    llvm/test/Transforms/MemCpyOpt/align.ll
    llvm/test/Transforms/MemCpyOpt/atomic.ll
    llvm/test/Transforms/MemCpyOpt/callslot_aa.ll
    llvm/test/Transforms/MemCpyOpt/callslot_deref.ll
    llvm/test/Transforms/MemCpyOpt/callslot_throw.ll
    llvm/test/Transforms/MemCpyOpt/capturing-func.ll
    llvm/test/Transforms/MemCpyOpt/crash.ll
    llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
    llvm/test/Transforms/MemCpyOpt/form-memset.ll
    llvm/test/Transforms/MemCpyOpt/invariant.start.ll
    llvm/test/Transforms/MemCpyOpt/lifetime.ll
    llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll
    llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll
    llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll
    llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
    llvm/test/Transforms/MemCpyOpt/memcpy.ll
    llvm/test/Transforms/MemCpyOpt/memmove.ll
    llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
    llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
    llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
    llvm/test/Transforms/MemCpyOpt/nontemporal.ll
    llvm/test/Transforms/MemCpyOpt/pr29105.ll
    llvm/test/Transforms/MemCpyOpt/pr37967.ll
    llvm/test/Transforms/MemCpyOpt/process_store.ll
    llvm/test/Transforms/MemCpyOpt/profitable-memset.ll
    llvm/test/Transforms/MemCpyOpt/smaller.ll
    llvm/test/Transforms/MemCpyOpt/sret.ll
    llvm/test/Transforms/MemCpyOpt/stackrestore.ll
    llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll
    llvm/test/Transforms/MemCpyOpt/store-to-memset.ll
    llvm/test/Transforms/MemCpyOpt/vscale-memset.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll b/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
index 237b8fec4f64..0f8c417f2127 100644
--- a/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
+++ b/llvm/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -basic-aa -memcpyopt -dse -S | grep "call.*initialize" | not grep memtmp
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basic-aa -memcpyopt -dse -S | FileCheck %s
 ; PR2077
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
@@ -7,6 +8,14 @@ target triple = "i386-pc-linux-gnu"
 %0 = type { x86_fp80, x86_fp80 }
 
 define internal fastcc void @initialize(%0* noalias nocapture sret %agg.result) nounwind {
+; CHECK-LABEL: @initialize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_RESULT_03:%.*]] = getelementptr [[TMP0:%.*]], %0* [[AGG_RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT:    store x86_fp80 0xK00000000000000000000, x86_fp80* [[AGG_RESULT_03]], align 4
+; CHECK-NEXT:    [[AGG_RESULT_15:%.*]] = getelementptr [[TMP0]], %0* [[AGG_RESULT]], i32 0, i32 1
+; CHECK-NEXT:    store x86_fp80 0xK00000000000000000000, x86_fp80* [[AGG_RESULT_15]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %agg.result.03 = getelementptr %0, %0* %agg.result, i32 0, i32 0
   store x86_fp80 0xK00000000000000000000, x86_fp80* %agg.result.03
@@ -18,6 +27,13 @@ entry:
 declare fastcc x86_fp80 @passed_uninitialized(%0* nocapture) nounwind
 
 define fastcc void @badly_optimized() nounwind {
+; CHECK-LABEL: @badly_optimized(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z:%.*]] = alloca [[TMP0:%.*]], align 8
+; CHECK-NEXT:    call fastcc void @initialize(%0* noalias sret [[Z]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call fastcc x86_fp80 @passed_uninitialized(%0* [[Z]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %z = alloca %0
   %tmp = alloca %0

diff  --git a/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll b/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
index a0f34b9baa6d..dbe819adb689 100644
--- a/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
+++ b/llvm/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -basic-aa -memcpyopt -S | not grep "call.*memcpy."
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basic-aa -memcpyopt -S | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 
 %a = type { i32 }
@@ -7,6 +8,18 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 declare void @g(%a* nocapture)
 
 define float @f() {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_VAR:%.*]] = alloca [[A:%.*]], align 8
+; CHECK-NEXT:    [[B_VAR:%.*]] = alloca [[B:%.*]], align 8
+; CHECK-NEXT:    [[B_VAR1:%.*]] = bitcast %b* [[B_VAR]] to %a*
+; CHECK-NEXT:    call void @g(%a* [[B_VAR1]])
+; CHECK-NEXT:    [[A_I8:%.*]] = bitcast %a* [[A_VAR]] to i8*
+; CHECK-NEXT:    [[B_I8:%.*]] = bitcast %b* [[B_VAR]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[B]], %b* [[B_VAR]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
+; CHECK-NEXT:    ret float [[TMP2]]
+;
 entry:
   %a_var = alloca %a
   %b_var = alloca %b, align 1

diff  --git a/llvm/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll b/llvm/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll
index 8ba8df4d8b39..bd086967ec29 100644
--- a/llvm/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll
+++ b/llvm/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basic-aa -memcpyopt -S | FileCheck %s
 ; PR10067
 ; Make sure the call+copy isn't optimized in such a way that
@@ -12,10 +13,25 @@ target triple = "i386-apple-darwin10"
 declare void @bar(%struct1* nocapture sret %agg.result) nounwind
 
 define i32 @foo() nounwind {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[X:%.*]] = alloca [[STRUCT1:%.*]], align 8
+; CHECK-NEXT:    [[Y:%.*]] = alloca [[STRUCT2:%.*]], align 8
+; CHECK-NEXT:    call void @bar(%struct1* sret [[X]]) [[ATTR0:#.*]]
+; CHECK-NEXT:    [[GEPN1:%.*]] = getelementptr inbounds [[STRUCT2]], %struct2* [[Y]], i32 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 0, i32* [[GEPN1]], align 8
+; CHECK-NEXT:    [[GEPN2:%.*]] = getelementptr inbounds [[STRUCT2]], %struct2* [[Y]], i32 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 0, i32* [[GEPN2]], align 4
+; CHECK-NEXT:    [[BIT1:%.*]] = bitcast %struct1* [[X]] to i64*
+; CHECK-NEXT:    [[BIT2:%.*]] = bitcast %struct2* [[Y]] to i64*
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, i64* [[BIT1]], align 8
+; CHECK-NEXT:    store i64 [[LOAD]], i64* [[BIT2]], align 8
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr [[STRUCT2]], %struct2* [[Y]], i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[RET:%.*]] = load i32, i32* [[GEP1]], align 4
+; CHECK-NEXT:    ret i32 [[RET]]
+;
   %x = alloca %struct1, align 8
   %y = alloca %struct2, align 8
   call void @bar(%struct1* sret %x) nounwind
-; CHECK: call void @bar(%struct1* sret %x)
 
   %gepn1 = getelementptr inbounds %struct2, %struct2* %y, i32 0, i32 0, i32 0
   store i32 0, i32* %gepn1, align 8
@@ -27,8 +43,6 @@ define i32 @foo() nounwind {
   %load = load i64, i64* %bit1, align 8
   store i64 %load, i64* %bit2, align 8
 
-; CHECK: %load = load i64, i64* %bit1, align 8
-; CHECK: store i64 %load, i64* %bit2, align 8
 
   %gep1 = getelementptr %struct2, %struct2* %y, i32 0, i32 0, i32 0
   %ret = load i32, i32* %gep1

diff  --git a/llvm/test/Transforms/MemCpyOpt/aggregate-type-crash.ll b/llvm/test/Transforms/MemCpyOpt/aggregate-type-crash.ll
index 16d107730acd..dd9536a85888 100644
--- a/llvm/test/Transforms/MemCpyOpt/aggregate-type-crash.ll
+++ b/llvm/test/Transforms/MemCpyOpt/aggregate-type-crash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -memcpyopt -S -o - < %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,13 +10,19 @@ target triple = "x86_64-apple-macosx10.14.0"
 declare noalias i8* @my_malloc(%my_struct*) #0
 
 define void @my_func(%my_struct* %0) {
+; CHECK-LABEL: @my_func(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = load [[MY_STRUCT:%.*]], %my_struct* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i8* @my_malloc(%my_struct* [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %my_struct*
+; CHECK-NEXT:    store [[MY_STRUCT]] [[TMP1]], %my_struct* [[TMP3]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
-; CHECK: entry:
   %1 = load %my_struct, %my_struct* %0
   %2 = call i8* @my_malloc(%my_struct* %0)
   %3 = bitcast i8* %2 to %my_struct*
   store %my_struct %1, %my_struct* %3
-; CHECK-NOT: call void @llvm.memcpy.{{.*}}.{{.*}}.{{.*}}
   ret void
 }
 

diff  --git a/llvm/test/Transforms/MemCpyOpt/align.ll b/llvm/test/Transforms/MemCpyOpt/align.ll
index 2e683bfa91f5..cdaf44f69e01 100644
--- a/llvm/test/Transforms/MemCpyOpt/align.ll
+++ b/llvm/test/Transforms/MemCpyOpt/align.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -S -basic-aa -memcpyopt | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 
@@ -9,7 +10,14 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
 
 define void @foo(i32* %p) {
 ; CHECK-LABEL: @foo(
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 {{.*}}, i8 0, i64 16, i1 false)
+; CHECK-NEXT:    [[A0:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 0
+; CHECK-NEXT:    [[A1:%.*]] = getelementptr i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[A3:%.*]] = getelementptr i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[A0]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP1]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a0 = getelementptr i32, i32* %p, i64 0
   store i32 0, i32* %a0, align 4
   %a1 = getelementptr i32, i32* %p, i64 1
@@ -25,8 +33,14 @@ define void @foo(i32* %p) {
 
 define void @bar() {
 ; CHECK-LABEL: @bar(
-; CHECK: %a4 = alloca i32, align 8
-; CHECK-NOT: memcpy
+; CHECK-NEXT:    [[A4:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[A8:%.*]] = alloca i32, align 8
+; CHECK-NEXT:    [[A8_CAST:%.*]] = bitcast i32* [[A8]] to i8*
+; CHECK-NEXT:    [[A4_CAST:%.*]] = bitcast i32* [[A4]] to i8*
+; CHECK-NEXT:    [[A41:%.*]] = bitcast i32* [[A4]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[A41]], i8 0, i64 4, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a4 = alloca i32, align 4
   %a8 = alloca i32, align 8
   %a8.cast = bitcast i32* %a8 to i8*

diff  --git a/llvm/test/Transforms/MemCpyOpt/atomic.ll b/llvm/test/Transforms/MemCpyOpt/atomic.ll
index 65f6c925e205..ed31766b2f54 100644
--- a/llvm/test/Transforms/MemCpyOpt/atomic.ll
+++ b/llvm/test/Transforms/MemCpyOpt/atomic.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -basic-aa -memcpyopt -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -11,8 +12,16 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
 
 ; memcpyopt should not touch atomic ops
 define void @test1() nounwind uwtable ssp {
-; CHECK: test1
-; CHECK: store atomic
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[X:%.*]] = alloca [101 x i32], align 16
+; CHECK-NEXT:    [[BC:%.*]] = bitcast [101 x i32]* [[X]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 16 [[BC]], i8 0, i64 400, i1 false)
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [101 x i32], [101 x i32]* [[X]], i32 0, i32 100
+; CHECK-NEXT:    store atomic i32 0, i32* [[GEP1]] unordered, align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [101 x i32], [101 x i32]* [[X]], i32 0, i32 0
+; CHECK-NEXT:    call void @otherf(i32* [[GEP2]])
+; CHECK-NEXT:    ret void
+;
   %x = alloca [101 x i32], align 16
   %bc = bitcast [101 x i32]* %x to i8*
   call void @llvm.memset.p0i8.i64(i8* align 16 %bc, i8 0, i64 400, i1 false)
@@ -25,17 +34,21 @@ define void @test1() nounwind uwtable ssp {
 
 ; memcpyopt across unordered store
 define void @test2() nounwind uwtable ssp {
-; CHECK: test2
-; CHECK: call
-; CHECK-NEXT: store atomic
-; CHECK-NEXT: call
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[OLD:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[NEW:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @otherf(i32* nocapture [[NEW]])
+; CHECK-NEXT:    store atomic i32 0, i32* @x unordered, align 4
+; CHECK-NEXT:    call void @otherf(i32* nocapture [[NEW]])
+; CHECK-NEXT:    ret void
+;
   %old = alloca i32
   %new = alloca i32
   call void @otherf(i32* nocapture %old)
   store atomic i32 0, i32* @x unordered, align 4
   %v = load i32, i32* %old
   store i32 %v, i32* %new
-  call void @otherf(i32* nocapture %new)  
+  call void @otherf(i32* nocapture %new)
   ret void
 }
 

diff  --git a/llvm/test/Transforms/MemCpyOpt/callslot_aa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_aa.ll
index 1d45cbe9e5cb..6e7b78d4da71 100644
--- a/llvm/test/Transforms/MemCpyOpt/callslot_aa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_aa.ll
@@ -1,12 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -S -basic-aa -memcpyopt | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 %T = type { i64, i64 }
 
 define void @test(i8* %src) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[DST:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST]], i8* align 8 [[SRC:%.*]], i64 1, i1 false)
+; CHECK-NEXT:    ret void
+;
   %tmp = alloca i8
   %dst = alloca i8
-; CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %src, i64 1, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %tmp, i8* align 8 %src, i64 1, i1 false), !noalias !2
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %tmp, i64 1, i1 false)
 

diff  --git a/llvm/test/Transforms/MemCpyOpt/callslot_deref.ll b/llvm/test/Transforms/MemCpyOpt/callslot_deref.ll
index ad578be711cd..a2c0503894a1 100644
--- a/llvm/test/Transforms/MemCpyOpt/callslot_deref.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_deref.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -S -basic-aa -memcpyopt | FileCheck %s
 target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -7,8 +8,13 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
 ; all bytes of %dst that are touch by the memset are dereferenceable
 define void @must_remove_memcpy(i8* noalias nocapture dereferenceable(4096) %dst) {
 ; CHECK-LABEL: @must_remove_memcpy(
-; CHECK: call void @llvm.memset.p0i8.i64
-; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [4096 x i8], align 1
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [4096 x i8], [4096 x i8]* [[SRC]], i64 0, i64 0
+; CHECK-NEXT:    [[DST1:%.*]] = bitcast i8* [[DST:%.*]] to [4096 x i8]*
+; CHECK-NEXT:    [[DST12:%.*]] = bitcast [4096 x i8]* [[DST1]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST12]], i8 0, i64 4096, i1 false)
+; CHECK-NEXT:    ret void
+;
   %src = alloca [4096 x i8], align 1
   %p = getelementptr inbounds [4096 x i8], [4096 x i8]* %src, i64 0, i64 0
   call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4096, i1 false)
@@ -20,8 +26,12 @@ define void @must_remove_memcpy(i8* noalias nocapture dereferenceable(4096) %dst
 ; We can't remove the memcpy, but we can turn it into an independent memset.
 define void @must_not_remove_memcpy(i8* noalias nocapture dereferenceable(1024) %dst) {
 ; CHECK-LABEL: @must_not_remove_memcpy(
-; CHECK: call void @llvm.memset.p0i8.i64
-; CHECK: call void @llvm.memset.p0i8.i64
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [4096 x i8], align 1
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [4096 x i8], [4096 x i8]* [[SRC]], i64 0, i64 0
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[P]], i8 0, i64 4096, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST:%.*]], i8 0, i64 4096, i1 false)
+; CHECK-NEXT:    ret void
+;
   %src = alloca [4096 x i8], align 1
   %p = getelementptr inbounds [4096 x i8], [4096 x i8]* %src, i64 0, i64 0
   call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4096, i1 false)

diff  --git a/llvm/test/Transforms/MemCpyOpt/callslot_throw.ll b/llvm/test/Transforms/MemCpyOpt/callslot_throw.ll
index 1aa4c92efc72..7092f046af31 100644
--- a/llvm/test/Transforms/MemCpyOpt/callslot_throw.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_throw.ll
@@ -1,34 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -memcpyopt < %s | FileCheck %s
 declare void @may_throw(i32* nocapture %x)
 
-; CHECK-LABEL: define void @test1(
 define void @test1(i32* nocapture noalias dereferenceable(4) %x) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[T:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @may_throw(i32* nonnull [[T]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[T]], align 4
+; CHECK-NEXT:    store i32 [[LOAD]], i32* [[X:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %t = alloca i32, align 4
   call void @may_throw(i32* nonnull %t)
   %load = load i32, i32* %t, align 4
   store i32 %load, i32* %x, align 4
-; CHECK:       %[[t:.*]] = alloca i32, align 4
-; CHECK-NEXT:  call void @may_throw(i32* {{.*}} %[[t]])
-; CHECK-NEXT:  %[[load:.*]] = load i32, i32* %[[t]], align 4
-; CHECK-NEXT:  store i32 %[[load]], i32* %x, align 4
   ret void
 }
 
 declare void @always_throws()
 
-; CHECK-LABEL: define void @test2(
 define void @test2(i32* nocapture noalias dereferenceable(4) %x) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[T:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @may_throw(i32* nonnull [[T]]) [[ATTR0:#.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[T]], align 4
+; CHECK-NEXT:    call void @always_throws()
+; CHECK-NEXT:    store i32 [[LOAD]], i32* [[X:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %t = alloca i32, align 4
   call void @may_throw(i32* nonnull %t) nounwind
   %load = load i32, i32* %t, align 4
   call void @always_throws()
   store i32 %load, i32* %x, align 4
-; CHECK:       %[[t:.*]] = alloca i32, align 4
-; CHECK-NEXT:  call void @may_throw(i32* {{.*}} %[[t]])
-; CHECK-NEXT:  %[[load:.*]] = load i32, i32* %[[t]], align 4
-; CHECK-NEXT:  call void @always_throws()
-; CHECK-NEXT:  store i32 %[[load]], i32* %x, align 4
   ret void
 }

diff  --git a/llvm/test/Transforms/MemCpyOpt/capturing-func.ll b/llvm/test/Transforms/MemCpyOpt/capturing-func.ll
index 0ea889a66497..8376ecd3d30d 100644
--- a/llvm/test/Transforms/MemCpyOpt/capturing-func.ll
+++ b/llvm/test/Transforms/MemCpyOpt/capturing-func.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basic-aa -memcpyopt -S | FileCheck %s
 
 target datalayout = "e"
@@ -6,6 +7,14 @@ declare void @foo(i8*)
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind
 
 define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[PTR1:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[PTR2:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @foo(i8* [[PTR2]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[PTR1]], i8* [[PTR2]], i32 1, i1 false)
+; CHECK-NEXT:    call void @foo(i8* [[PTR1]])
+; CHECK-NEXT:    ret void
+;
   %ptr1 = alloca i8
   %ptr2 = alloca i8
   call void @foo(i8* %ptr2)
@@ -15,8 +24,4 @@ define void @test() {
 
   ; Check that the transformation isn't applied if the called function can
   ; capture the pointer argument (i.e. the nocapture attribute isn't present)
-  ; CHECK-LABEL: @test(
-  ; CHECK: call void @foo(i8* %ptr2)
-  ; CHECK-NEXT: call void @llvm.memcpy
-  ; CHECK-NEXT: call void @foo(i8* %ptr1)
 }

diff  --git a/llvm/test/Transforms/MemCpyOpt/crash.ll b/llvm/test/Transforms/MemCpyOpt/crash.ll
index 1fd4d0deae6d..489a1827604b 100644
--- a/llvm/test/Transforms/MemCpyOpt/crash.ll
+++ b/llvm/test/Transforms/MemCpyOpt/crash.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -basic-aa -memcpyopt -disable-output
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -basic-aa -memcpyopt | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "armv7-eabi"
@@ -8,6 +9,30 @@ target triple = "armv7-eabi"
 
 ; PR4882
 define void @test1(%struct.bar* %this) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_BAR:%.*]], %struct.bar* [[THIS:%.*]], i32 0, i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 0, i32 0, i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 0, i32 0, i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 1, i32 0, i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 1, i32 0, i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 1, i32 0, i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 1, i32 0, i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 3, i32 0, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 3, i32 0, i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 3, i32 0, i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 4, i32 0, i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 4, i32 0, i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 4, i32 0, i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 4, i32 0, i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_BAR]], %struct.bar* [[THIS]], i32 0, i32 5
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP0]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP16]], i8 0, i64 32, i1 false)
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP8]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP17]], i8 0, i64 32, i1 false)
+; CHECK-NEXT:    unreachable
+;
 entry:
   %0 = getelementptr inbounds %struct.bar, %struct.bar* %this, i32 0, i32 0, i32 0, i32 0
   store float 0.000000e+00, float* %0, align 4
@@ -49,6 +74,10 @@ entry:
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
 
 define void @test2(i32 %cmd) nounwind {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* undef, i64 20, i1 false) [[ATTR1:#.*]]
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* undef, i64 20, i1 false) nounwind
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* undef, i64 20, i1 false) nounwind
   ret void

diff  --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
index 6ce1aee338d8..777ba51f3827 100644
--- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -memcpyopt -S < %s | FileCheck %s
 
 target datalayout = "e-i64:64-f80:128-n8:16:32:64"
@@ -6,41 +7,49 @@ target triple = "x86_64-unknown-linux-gnu"
 %S = type { i8*, i8, i32 }
 
 define void @copy(%S* %src, %S* %dst) {
-; CHECK-LABEL: copy
-; CHECK-NOT: load
-; CHECK: call void @llvm.memmove.p0i8.p0i8.i64
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @copy(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %S* [[DST:%.*]] to i8*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %S* [[SRC:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   store %S %1, %S* %dst
   ret void
 }
 
 define void @noaliassrc(%S* noalias %src, %S* %dst) {
-; CHECK-LABEL: noaliassrc
-; CHECK-NOT: load
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @noaliassrc(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %S* [[DST:%.*]] to i8*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %S* [[SRC:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   store %S %1, %S* %dst
   ret void
 }
 
 define void @noaliasdst(%S* %src, %S* noalias %dst) {
-; CHECK-LABEL: noaliasdst
-; CHECK-NOT: load
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @noaliasdst(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %S* [[DST:%.*]] to i8*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %S* [[SRC:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   store %S %1, %S* %dst
   ret void
 }
 
 define void @destroysrc(%S* %src, %S* %dst) {
-; CHECK-LABEL: destroysrc
-; CHECK: load %S, %S* %src
-; CHECK: call void @llvm.memset.p0i8.i64
-; CHECK-NEXT: store %S %1, %S* %dst
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @destroysrc(
+; CHECK-NEXT:    [[TMP1:%.*]] = load [[S:%.*]], %S* [[SRC:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %S* [[SRC]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP2]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    store [[S]] [[TMP1]], %S* [[DST:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   store %S zeroinitializer, %S* %src
   store %S %1, %S* %dst
@@ -48,11 +57,14 @@ define void @destroysrc(%S* %src, %S* %dst) {
 }
 
 define void @destroynoaliassrc(%S* noalias %src, %S* %dst) {
-; CHECK-LABEL: destroynoaliassrc
-; CHECK-NOT: load
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @destroynoaliassrc(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %S* [[SRC:%.*]] to i8*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %S* [[DST:%.*]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %S* [[SRC]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP1]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   store %S zeroinitializer, %S* %src
   store %S %1, %S* %dst
@@ -60,12 +72,14 @@ define void @destroynoaliassrc(%S* noalias %src, %S* %dst) {
 }
 
 define void @copyalias(%S* %src, %S* %dst) {
-; CHECK-LABEL: copyalias
-; CHECK-NEXT: [[LOAD:%[a-z0-9\.]+]] = load %S, %S* %src
-; CHECK-NOT: load
-; CHECK: call void @llvm.memmove.p0i8.p0i8.i64
-; CHECK-NEXT: store %S [[LOAD]], %S* %dst
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @copyalias(
+; CHECK-NEXT:    [[TMP1:%.*]] = load [[S:%.*]], %S* [[SRC:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %S* [[DST:%.*]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %S* [[SRC]] to i8*
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
+; CHECK-NEXT:    store [[S]] [[TMP1]], %S* [[DST]], align 8
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   %2 = load %S, %S* %src
   store %S %1, %S* %dst
@@ -76,14 +90,15 @@ define void @copyalias(%S* %src, %S* %dst) {
 ; If the store address is computed in a complex manner, make
 ; sure we lift the computation as well if needed and possible.
 define void @addrproducer(%S* %src, %S* %dst) {
-; CHECK-LABEL: addrproducer(
-; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8*
-; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i64 1
-; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8*
-; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8*
-; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST]], i64 16, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @addrproducer(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %S* [[DST:%.*]] to i8*
+; CHECK-NEXT:    [[DST2:%.*]] = getelementptr [[S:%.*]], %S* [[DST]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %S* [[DST2]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %S* [[SRC:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP1]], i8 undef, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   store %S undef, %S* %dst
   %dst2 = getelementptr %S , %S* %dst, i64 1
@@ -92,14 +107,15 @@ define void @addrproducer(%S* %src, %S* %dst) {
 }
 
 define void @aliasaddrproducer(%S* %src, %S* %dst, i32* %dstidptr) {
-; CHECK-LABEL: aliasaddrproducer(
-; CHECK-NEXT: %[[SRC:[0-9]+]] = load %S, %S* %src
-; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false)
-; CHECK-NEXT: %dstindex = load i32, i32* %dstidptr
-; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex
-; CHECK-NEXT: store %S %[[SRC]], %S* %dst2
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @aliasaddrproducer(
+; CHECK-NEXT:    [[TMP1:%.*]] = load [[S:%.*]], %S* [[SRC:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %S* [[DST:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP2]], i8 undef, i64 16, i1 false)
+; CHECK-NEXT:    [[DSTINDEX:%.*]] = load i32, i32* [[DSTIDPTR:%.*]], align 4
+; CHECK-NEXT:    [[DST2:%.*]] = getelementptr [[S]], %S* [[DST]], i32 [[DSTINDEX]]
+; CHECK-NEXT:    store [[S]] [[TMP1]], %S* [[DST2]], align 8
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   store %S undef, %S* %dst
   %dstindex = load i32, i32* %dstidptr
@@ -109,16 +125,17 @@ define void @aliasaddrproducer(%S* %src, %S* %dst, i32* %dstidptr) {
 }
 
 define void @noaliasaddrproducer(%S* %src, %S* noalias %dst, i32* noalias %dstidptr) {
-; CHECK-LABEL: noaliasaddrproducer(
-; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8*
-; CHECK-NEXT: %[[LOADED:[0-9]+]] = load i32, i32* %dstidptr
-; CHECK-NEXT: %dstindex = or i32 %[[LOADED]], 1
-; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex
-; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8*
-; CHECK-NEXT: %[[SRCCAST2:[0-9]+]] = bitcast %S* %src to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST2]], i64 16, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SRCCAST]], i8 undef, i64 16, i1 false)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @noaliasaddrproducer(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %S* [[SRC:%.*]] to i8*
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[DSTIDPTR:%.*]], align 4
+; CHECK-NEXT:    [[DSTINDEX:%.*]] = or i32 [[TMP2]], 1
+; CHECK-NEXT:    [[DST2:%.*]] = getelementptr [[S:%.*]], %S* [[DST:%.*]], i32 [[DSTINDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %S* [[DST2]] to i8*
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %S* [[SRC]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP1]], i8 undef, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %1 = load %S, %S* %src
   store %S undef, %S* %src
   %2 = load i32, i32* %dstidptr

diff --git a/llvm/test/Transforms/MemCpyOpt/form-memset.ll b/llvm/test/Transforms/MemCpyOpt/form-memset.ll
index dde025dac926..bec6b8855a2b 100644
--- a/llvm/test/Transforms/MemCpyOpt/form-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/form-memset.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -memcpyopt -S | FileCheck %s
 
 ; All the stores in this example should be merged into a single memset.
@@ -6,53 +7,74 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 target triple = "i386-apple-darwin8"
 
 define void @test1(i8 signext  %c) nounwind  {
-entry:
-	%x = alloca [19 x i8]		; <[19 x i8]*> [#uses=20]
-	%tmp = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 0		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp, align 1
-	%tmp5 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 1		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp5, align 1
-	%tmp9 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 2		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp9, align 1
-	%tmp13 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 3		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp13, align 1
-	%tmp17 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 4		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp17, align 1
-	%tmp21 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 5		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp21, align 1
-	%tmp25 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 6		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp25, align 1
-	%tmp29 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 7		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp29, align 1
-	%tmp33 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 8		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp33, align 1
-	%tmp37 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 9		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp37, align 1
-	%tmp41 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 10		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp41, align 1
-	%tmp45 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 11		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp45, align 1
-	%tmp49 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 12		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp49, align 1
-	%tmp53 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 13		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp53, align 1
-	%tmp57 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 14		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp57, align 1
-	%tmp61 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 15		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp61, align 1
-	%tmp65 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 16		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp65, align 1
-	%tmp69 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 17		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp69, align 1
-	%tmp73 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 18		; <i8*> [#uses=1]
-	store i8 %c, i8* %tmp73, align 1
-	%tmp76 = call i32 (...) @bar( [19 x i8]* %x ) nounwind
-	ret void
 ; CHECK-LABEL: @test1(
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64
-; CHECK-NOT: store
-; CHECK: ret
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X:%.*]] = alloca [19 x i8], align 1
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 4
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 5
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 6
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 7
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 8
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 9
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 10
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 11
+; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 12
+; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 13
+; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 14
+; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 15
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 16
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 17
+; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr [19 x i8], [19 x i8]* [[X]], i32 0, i32 18
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP]], i8 [[C:%.*]], i64 19, i1 false)
+; CHECK-NEXT:    [[TMP76:%.*]] = call i32 (...) @bar([19 x i8]* [[X]]) [[ATTR0:#.*]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %x = alloca [19 x i8]		; <[19 x i8]*> [#uses=20]
+  %tmp = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 0		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp, align 1
+  %tmp5 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 1		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp5, align 1
+  %tmp9 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 2		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp9, align 1
+  %tmp13 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 3		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp13, align 1
+  %tmp17 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 4		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp17, align 1
+  %tmp21 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 5		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp21, align 1
+  %tmp25 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 6		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp25, align 1
+  %tmp29 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 7		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp29, align 1
+  %tmp33 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 8		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp33, align 1
+  %tmp37 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 9		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp37, align 1
+  %tmp41 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 10		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp41, align 1
+  %tmp45 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 11		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp45, align 1
+  %tmp49 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 12		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp49, align 1
+  %tmp53 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 13		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp53, align 1
+  %tmp57 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 14		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp57, align 1
+  %tmp61 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 15		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp61, align 1
+  %tmp65 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 16		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp65, align 1
+  %tmp69 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 17		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp69, align 1
+  %tmp73 = getelementptr [19 x i8], [19 x i8]* %x, i32 0, i32 18		; <i8*> [#uses=1]
+  store i8 %c, i8* %tmp73, align 1
+  %tmp76 = call i32 (...) @bar( [19 x i8]* %x ) nounwind
+  ret void
 }
 
 declare i32 @bar(...)
@@ -61,104 +83,150 @@ declare i32 @bar(...)
 
 
 define void @test2() nounwind  {
-entry:
-	%ref_idx = alloca [8 x i8]		; <[8 x i8]*> [#uses=8]
-	%left_mvd = alloca [8 x %struct.MV]		; <[8 x %struct.MV]*> [#uses=17]
-	%up_mvd = alloca [8 x %struct.MV]		; <[8 x %struct.MV]*> [#uses=17]
-	%tmp20 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 7		; <i8*> [#uses=1]
-	store i8 -1, i8* %tmp20, align 1
-	%tmp23 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 6		; <i8*> [#uses=1]
-	store i8 -1, i8* %tmp23, align 1
-	%tmp26 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 5		; <i8*> [#uses=1]
-	store i8 -1, i8* %tmp26, align 1
-	%tmp29 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 4		; <i8*> [#uses=1]
-	store i8 -1, i8* %tmp29, align 1
-	%tmp32 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 3		; <i8*> [#uses=1]
-	store i8 -1, i8* %tmp32, align 1
-	%tmp35 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 2		; <i8*> [#uses=1]
-	store i8 -1, i8* %tmp35, align 1
-	%tmp38 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 1		; <i8*> [#uses=1]
-	store i8 -1, i8* %tmp38, align 1
-	%tmp41 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 0		; <i8*> [#uses=2]
-	store i8 -1, i8* %tmp41, align 1
-	%tmp43 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp43, align 2
-	%tmp46 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp46, align 2
-	%tmp57 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp57, align 2
-	%tmp60 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp60, align 2
-	%tmp71 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp71, align 2
-	%tmp74 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp74, align 2
-	%tmp85 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp85, align 2
-	%tmp88 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp88, align 2
-	%tmp99 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp99, align 2
-	%tmp102 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp102, align 2
-	%tmp113 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp113, align 2
-	%tmp116 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp116, align 2
-	%tmp127 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp127, align 2
-	%tmp130 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp130, align 2
-	%tmp141 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp141, align 8
-	%tmp144 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp144, align 2
-	%tmp148 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp148, align 2
-	%tmp151 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp151, align 2
-	%tmp162 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp162, align 2
-	%tmp165 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp165, align 2
-	%tmp176 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp176, align 2
-	%tmp179 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp179, align 2
-	%tmp190 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp190, align 2
-	%tmp193 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp193, align 2
-	%tmp204 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp204, align 2
-	%tmp207 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp207, align 2
-	%tmp218 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp218, align 2
-	%tmp221 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp221, align 2
-	%tmp232 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp232, align 2
-	%tmp235 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp235, align 2
-	%tmp246 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 0		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp246, align 8
-	%tmp249 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 1		; <i16*> [#uses=1]
-	store i16 0, i16* %tmp249, align 2
-	%up_mvd252 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0		; <%struct.MV*> [#uses=1]
-	%left_mvd253 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0		; <%struct.MV*> [#uses=1]
-	call void @foo( %struct.MV* %up_mvd252, %struct.MV* %left_mvd253, i8* %tmp41 ) nounwind 
-	ret void
-        
 ; CHECK-LABEL: @test2(
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %tmp41, i8 -1, i64 8, i1 false)
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 %0, i8 0, i64 32, i1 false)
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 %1, i8 0, i64 32, i1 false)
-; CHECK-NOT: store
-; CHECK: ret
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[REF_IDX:%.*]] = alloca [8 x i8], align 1
+; CHECK-NEXT:    [[LEFT_MVD:%.*]] = alloca [8 x %struct.MV], align 8
+; CHECK-NEXT:    [[UP_MVD:%.*]] = alloca [8 x %struct.MV], align 8
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr [8 x i8], [8 x i8]* [[REF_IDX]], i32 0, i32 7
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr [8 x i8], [8 x i8]* [[REF_IDX]], i32 0, i32 6
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr [8 x i8], [8 x i8]* [[REF_IDX]], i32 0, i32 5
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr [8 x i8], [8 x i8]* [[REF_IDX]], i32 0, i32 4
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr [8 x i8], [8 x i8]* [[REF_IDX]], i32 0, i32 3
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr [8 x i8], [8 x i8]* [[REF_IDX]], i32 0, i32 2
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr [8 x i8], [8 x i8]* [[REF_IDX]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr [8 x i8], [8 x i8]* [[REF_IDX]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 7, i32 0
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP41]], i8 -1, i64 8, i1 false)
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 7, i32 1
+; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 6, i32 0
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 6, i32 1
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 5, i32 0
+; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 5, i32 1
+; CHECK-NEXT:    [[TMP85:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 4, i32 0
+; CHECK-NEXT:    [[TMP88:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 4, i32 1
+; CHECK-NEXT:    [[TMP99:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 3, i32 0
+; CHECK-NEXT:    [[TMP102:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 3, i32 1
+; CHECK-NEXT:    [[TMP113:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 2, i32 0
+; CHECK-NEXT:    [[TMP116:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 2, i32 1
+; CHECK-NEXT:    [[TMP127:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 1, i32 0
+; CHECK-NEXT:    [[TMP130:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 1, i32 1
+; CHECK-NEXT:    [[TMP141:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TMP144:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[TMP148:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 7, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[TMP141]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 32, i1 false)
+; CHECK-NEXT:    [[TMP151:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 7, i32 1
+; CHECK-NEXT:    [[TMP162:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 6, i32 0
+; CHECK-NEXT:    [[TMP165:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 6, i32 1
+; CHECK-NEXT:    [[TMP176:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 5, i32 0
+; CHECK-NEXT:    [[TMP179:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 5, i32 1
+; CHECK-NEXT:    [[TMP190:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 4, i32 0
+; CHECK-NEXT:    [[TMP193:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 4, i32 1
+; CHECK-NEXT:    [[TMP204:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 3, i32 0
+; CHECK-NEXT:    [[TMP207:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 3, i32 1
+; CHECK-NEXT:    [[TMP218:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 2, i32 0
+; CHECK-NEXT:    [[TMP221:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 2, i32 1
+; CHECK-NEXT:    [[TMP232:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 1, i32 0
+; CHECK-NEXT:    [[TMP235:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 1, i32 1
+; CHECK-NEXT:    [[TMP246:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TMP249:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[UP_MVD252:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[UP_MVD]], i32 0, i32 0
+; CHECK-NEXT:    [[LEFT_MVD253:%.*]] = getelementptr [8 x %struct.MV], [8 x %struct.MV]* [[LEFT_MVD]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP246]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP1]], i8 0, i64 32, i1 false)
+; CHECK-NEXT:    call void @foo(%struct.MV* [[UP_MVD252]], %struct.MV* [[LEFT_MVD253]], i8* [[TMP41]]) [[ATTR0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ref_idx = alloca [8 x i8]		; <[8 x i8]*> [#uses=8]
+  %left_mvd = alloca [8 x %struct.MV]		; <[8 x %struct.MV]*> [#uses=17]
+  %up_mvd = alloca [8 x %struct.MV]		; <[8 x %struct.MV]*> [#uses=17]
+  %tmp20 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 7		; <i8*> [#uses=1]
+  store i8 -1, i8* %tmp20, align 1
+  %tmp23 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 6		; <i8*> [#uses=1]
+  store i8 -1, i8* %tmp23, align 1
+  %tmp26 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 5		; <i8*> [#uses=1]
+  store i8 -1, i8* %tmp26, align 1
+  %tmp29 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 4		; <i8*> [#uses=1]
+  store i8 -1, i8* %tmp29, align 1
+  %tmp32 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 3		; <i8*> [#uses=1]
+  store i8 -1, i8* %tmp32, align 1
+  %tmp35 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 2		; <i8*> [#uses=1]
+  store i8 -1, i8* %tmp35, align 1
+  %tmp38 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 1		; <i8*> [#uses=1]
+  store i8 -1, i8* %tmp38, align 1
+  %tmp41 = getelementptr [8 x i8], [8 x i8]* %ref_idx, i32 0, i32 0		; <i8*> [#uses=2]
+  store i8 -1, i8* %tmp41, align 1
+  %tmp43 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp43, align 2
+  %tmp46 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 7, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp46, align 2
+  %tmp57 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp57, align 2
+  %tmp60 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 6, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp60, align 2
+  %tmp71 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp71, align 2
+  %tmp74 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 5, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp74, align 2
+  %tmp85 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp85, align 2
+  %tmp88 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 4, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp88, align 2
+  %tmp99 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp99, align 2
+  %tmp102 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 3, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp102, align 2
+  %tmp113 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp113, align 2
+  %tmp116 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 2, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp116, align 2
+  %tmp127 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp127, align 2
+  %tmp130 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 1, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp130, align 2
+  %tmp141 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp141, align 8
+  %tmp144 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp144, align 2
+  %tmp148 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp148, align 2
+  %tmp151 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 7, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp151, align 2
+  %tmp162 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp162, align 2
+  %tmp165 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 6, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp165, align 2
+  %tmp176 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp176, align 2
+  %tmp179 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 5, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp179, align 2
+  %tmp190 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp190, align 2
+  %tmp193 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 4, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp193, align 2
+  %tmp204 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp204, align 2
+  %tmp207 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 3, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp207, align 2
+  %tmp218 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp218, align 2
+  %tmp221 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 2, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp221, align 2
+  %tmp232 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp232, align 2
+  %tmp235 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 1, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp235, align 2
+  %tmp246 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 0		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp246, align 8
+  %tmp249 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0, i32 1		; <i16*> [#uses=1]
+  store i16 0, i16* %tmp249, align 2
+  %up_mvd252 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %up_mvd, i32 0, i32 0		; <%struct.MV*> [#uses=1]
+  %left_mvd253 = getelementptr [8 x %struct.MV], [8 x %struct.MV]* %left_mvd, i32 0, i32 0		; <%struct.MV*> [#uses=1]
+  call void @foo( %struct.MV* %up_mvd252, %struct.MV* %left_mvd253, i8* %tmp41 ) nounwind
+  ret void
+
 }
 
 declare void @foo(%struct.MV*, %struct.MV*, i8*)
@@ -166,6 +234,15 @@ declare void @foo(%struct.MV*, %struct.MV*, i8*)
 
 ; Store followed by memset.
 define void @test3(i32* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ADD_PTR]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP1]], i8 0, i64 15, i1 false)
+; CHECK-NEXT:    ret void
+;
 entry:
   %arrayidx = getelementptr inbounds i32, i32* %P, i64 1
   store i32 0, i32* %arrayidx, align 4
@@ -173,28 +250,39 @@ entry:
   %0 = bitcast i32* %add.ptr to i8*
   tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 11, i1 false)
   ret void
-; CHECK-LABEL: @test3(
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 15, i1 false)
 }
 
 ; store followed by memset, different offset scenario
 define void @test4(i32* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ADD_PTR]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP1]], i8 0, i64 15, i1 false)
+; CHECK-NEXT:    ret void
+;
 entry:
   store i32 0, i32* %P, align 4
   %add.ptr = getelementptr inbounds i32, i32* %P, i64 1
   %0 = bitcast i32* %add.ptr to i8*
   tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 11, i1 false)
   ret void
-; CHECK-LABEL: @test4(
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 15, i1 false)
 }
 
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
 
 ; Memset followed by store.
 define void @test5(i32* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ADD_PTR]] to i8*
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP1]], i8 0, i64 15, i1 false)
+; CHECK-NEXT:    ret void
+;
 entry:
   %add.ptr = getelementptr inbounds i32, i32* %P, i64 2
   %0 = bitcast i32* %add.ptr to i8*
@@ -202,13 +290,19 @@ entry:
   %arrayidx = getelementptr inbounds i32, i32* %P, i64 1
   store i32 0, i32* %arrayidx, align 4
   ret void
-; CHECK-LABEL: @test5(
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 15, i1 false)
 }
 
 ;; Memset followed by memset.
 define void @test6(i32* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to i8*
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[ADD_PTR]] to i8*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[P]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[TMP2]], i8 0, i64 24, i1 false)
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = bitcast i32* %P to i8*
   tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 12, i1 false)
@@ -216,13 +310,20 @@ entry:
   %1 = bitcast i32* %add.ptr to i8*
   tail call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 12, i1 false)
   ret void
-; CHECK-LABEL: @test6(
-; CHECK: call void @llvm.memset.p0i8.i64(i8* %2, i8 0, i64 24, i1 false)
 }
 
 ; More aggressive heuristic
 ; rdar://9892684
 define void @test7(i32* nocapture %c) nounwind optsize {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[C]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP5]], i8 -1, i64 20, i1 false)
+; CHECK-NEXT:    ret void
+;
   store i32 -1, i32* %c, align 4
   %1 = getelementptr inbounds i32, i32* %c, i32 1
   store i32 -1, i32* %1, align 4
@@ -232,26 +333,33 @@ define void @test7(i32* nocapture %c) nounwind optsize {
   store i32 -1, i32* %3, align 4
   %4 = getelementptr inbounds i32, i32* %c, i32 4
   store i32 -1, i32* %4, align 4
-; CHECK-LABEL: @test7(
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %5, i8 -1, i64 20, i1 false)
   ret void
 }
 
 %struct.test8 = type { [4 x i32] }
 
 define void @test8() {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MEMTMP:%.*]] = alloca [[STRUCT_TEST8:%.*]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.test8* [[MEMTMP]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* [[TMP0]], align 16
+; CHECK-NEXT:    ret void
+;
 entry:
   %memtmp = alloca %struct.test8, align 16
   %0 = bitcast %struct.test8* %memtmp to <4 x i32>*
   store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %0, align 16
   ret void
-; CHECK-LABEL: @test8(
-; CHECK: store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %0, align 16
 }
 
 @test9buf = internal unnamed_addr global [16 x i64] zeroinitializer, align 16
 
 define void @test9() nounwind {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([16 x i64]* @test9buf to i8*), i8 -1, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   store i8 -1, i8* bitcast ([16 x i64]* @test9buf to i8*), align 16
   store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 1), align 1
   store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 2), align 2
@@ -269,24 +377,31 @@ define void @test9() nounwind {
   store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 14), align 2
   store i8 -1, i8* getelementptr (i8, i8* bitcast ([16 x i64]* @test9buf to i8*), i64 15), align 1
   ret void
-; CHECK-LABEL: @test9(
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 bitcast ([16 x i64]* @test9buf to i8*), i8 -1, i64 16, i1 false)
 }
 
 ; PR19092
 define void @test10(i8* nocapture %P) nounwind {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[P:%.*]], i8 0, i64 42, i1 false)
+; CHECK-NEXT:    ret void
+;
   tail call void @llvm.memset.p0i8.i64(i8* %P, i8 0, i64 42, i1 false)
   tail call void @llvm.memset.p0i8.i64(i8* %P, i8 0, i64 23, i1 false)
   ret void
-; CHECK-LABEL: @test10(
-; CHECK-NOT: memset
-; CHECK: call void @llvm.memset.p0i8.i64(i8* %P, i8 0, i64 42, i1 false)
-; CHECK-NOT: memset
-; CHECK: ret void
 }
 
 ; Memset followed by odd store.
 define void @test11(i32* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ADD_PTR]] to i8*
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 0
+; CHECK-NEXT:    [[ARRAYIDX_CAST:%.*]] = bitcast i32* [[ARRAYIDX]] to i96*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i96* [[ARRAYIDX_CAST]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP1]], i8 1, i64 23, i1 false)
+; CHECK-NEXT:    ret void
+;
 entry:
   %add.ptr = getelementptr inbounds i32, i32* %P, i64 3
   %0 = bitcast i32* %add.ptr to i8*
@@ -295,20 +410,22 @@ entry:
   %arrayidx.cast = bitcast i32* %arrayidx to i96*
   store i96 310698676526526814092329217, i96* %arrayidx.cast, align 4
   ret void
-; CHECK-LABEL: @test11(
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 1, i64 23, i1 false)
 }
 
 ; Alignment should be preserved when there is a store with default align
 define void @test12(i32* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ADD_PTR]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP1]], i8 0, i64 15, i1 false)
+; CHECK-NEXT:    ret void
+;
 entry:
   store i32 0, i32* %P
   %add.ptr = getelementptr inbounds i32, i32* %P, i64 1
   %0 = bitcast i32* %add.ptr to i8*
   tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 11, i1 false)
   ret void
-; CHECK-LABEL: @test12(
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 15, i1 false)
 }

diff  --git a/llvm/test/Transforms/MemCpyOpt/invariant.start.ll b/llvm/test/Transforms/MemCpyOpt/invariant.start.ll
index b7e3160c7da7..1bab2f65799a 100644
--- a/llvm/test/Transforms/MemCpyOpt/invariant.start.ll
+++ b/llvm/test/Transforms/MemCpyOpt/invariant.start.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; MemCpy optimizations should take place even in presence of invariant.start
 ; RUN: opt < %s -basic-aa -memcpyopt -dse -S | FileCheck %s
 
@@ -16,30 +17,32 @@ declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
 ; The intermediate alloca and one of the memcpy's should be eliminated, the
 ; other should be transformed to a memmove.
 define void @test1(i8* %P, i8* %Q) nounwind  {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i1 false)
+; CHECK-NEXT:    [[I:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 32, i8* [[P]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[R]], i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
   %memtmp = alloca %0, align 16
   %R = bitcast %0* %memtmp to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
   %i = call {}* @llvm.invariant.start.p0i8(i64 32, i8* %P)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
   ret void
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: %memtmp = alloca %0, align 16
-; CHECK-NEXT: %R = bitcast %0* %memtmp to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
-; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 32, i8* %P)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
-; CHECK-NEXT: ret void
 }
 
 
 ; The invariant.start intrinsic does not inhibit transforming the memcpy to a
 ; memset.
 define void @test2(i8* %dst1, i8* %dst2, i8 %c) {
-; CHECK-LABEL: define void @test2(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: %i = call {}* @llvm.invariant.start.p0i8(i64 32, i8* %dst1)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %dst2, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    [[I:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 32, i8* [[DST1]])
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 128, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
   %i = call {}* @llvm.invariant.start.p0i8(i64 32, i8* %dst1)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i1 false)

diff  --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
index ad14bdd6df66..f998a194d688 100644
--- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O2 -S | FileCheck %s
 
 ; performCallSlotOptzn in MemCpy should not exchange the calls to
@@ -8,10 +9,13 @@ declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 define void @_ZN4CordC2EOS_(i8* nocapture dereferenceable(16) %arg1) {
+; CHECK-LABEL: @_ZN4CordC2EOS_(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP_SROA_3_0_ARG1_SROA_RAW_IDX:%.*]] = getelementptr inbounds i8, i8* [[ARG1:%.*]], i64 7
+; CHECK-NEXT:    store i8 0, i8* [[TMP_SROA_3_0_ARG1_SROA_RAW_IDX]], align 1
+; CHECK-NEXT:    ret void
+;
 bb:
-; CHECK-LABEL: @_ZN4CordC2EOS_
-; CHECK-NOT: call void @llvm.lifetime.start
-; CHECK: ret void
   %tmp = alloca [8 x i8], align 8
   %tmp5 = bitcast [8 x i8]* %tmp to i8*
   call void @llvm.lifetime.start.p0i8(i64 16, i8* %tmp5)

diff  --git a/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll
index c3f7a1127281..1c61132eb2d2 100644
--- a/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/load-store-to-memcpy.ll
@@ -35,9 +35,9 @@ define void @test_memcpy(%T* noalias align 8 %a, %T* noalias align 16 %b) {
 define void @f(%T* %a, %T* %b, %T* %c, %T* %d) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:    [[VAL:%.*]] = load [[T:%.*]], %T* [[A:%.*]], align 4, !alias.scope !0
-; CHECK-NEXT:    store [[T]] { i8 23, i32 23 }, %T* [[B:%.*]], !alias.scope !3
-; CHECK-NEXT:    store [[T]] { i8 44, i32 44 }, %T* [[C:%.*]], !alias.scope !6, !noalias !3
-; CHECK-NEXT:    store [[T]] %val, %T* [[D:%.*]], !alias.scope !9, !noalias !12
+; CHECK-NEXT:    store [[T]] { i8 23, i32 23 }, %T* [[B:%.*]], align 4, !alias.scope !3
+; CHECK-NEXT:    store [[T]] { i8 44, i32 44 }, %T* [[C:%.*]], align 4, !alias.scope !6, !noalias !3
+; CHECK-NEXT:    store [[T]] [[VAL]], %T* [[D:%.*]], align 4, !alias.scope !9, !noalias !12
 ; CHECK-NEXT:    ret void
 ;
   %val = load %T, %T* %a, !alias.scope !{!10}

diff  --git a/llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll b/llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll
index 0f8a70a5511d..9b0098a499d9 100644
--- a/llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll
+++ b/llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S < %s -basic-aa -memcpyopt | FileCheck %s
 ; <rdar://problem/8536696>
 
@@ -6,19 +7,22 @@ target triple = "x86_64-apple-darwin10.0.0"
 
 %"class.std::auto_ptr" = type { i32* }
 
-; CHECK-LABEL: @_Z3foov(
 define void @_Z3foov(%"class.std::auto_ptr"* noalias nocapture sret %agg.result) ssp {
+; CHECK-LABEL: @_Z3foov(
+; CHECK-NEXT:  _ZNSt8auto_ptrIiED1Ev.exit:
+; CHECK-NEXT:    [[TEMP_LVALUE:%.*]] = alloca %"class.std::auto_ptr", align 8
+; CHECK-NEXT:    call void @_Z3barv(%"class.std::auto_ptr"* sret [[AGG_RESULT:%.*]])
+; CHECK-NEXT:    [[TMP_I_I:%.*]] = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* [[TEMP_LVALUE]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP_I_I4:%.*]] = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* [[AGG_RESULT]], i64 0, i32 0
+; CHECK-NEXT:    ret void
+;
 _ZNSt8auto_ptrIiED1Ev.exit:
   %temp.lvalue = alloca %"class.std::auto_ptr", align 8
-; CHECK: call void @_Z3barv(%"class.std::auto_ptr"* sret %agg.result)
   call void @_Z3barv(%"class.std::auto_ptr"* sret %temp.lvalue)
   %tmp.i.i = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* %temp.lvalue, i64 0, i32 0
-; CHECK-NOT: load
   %tmp2.i.i = load i32*, i32** %tmp.i.i, align 8
   %tmp.i.i4 = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* %agg.result, i64 0, i32 0
-; CHECK-NOT: store
   store i32* %tmp2.i.i, i32** %tmp.i.i4, align 8
-; CHECK: ret void
   ret void
 }
 

diff  --git a/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll
index 1424ca3709cc..97237a6e68dd 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll
@@ -1,89 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -memcpyopt -S < %s | FileCheck %s
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
 
 @undef = internal constant i32 undef, align 4
 define void @test_undef() nounwind {
+; CHECK-LABEL: @test_undef(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[I8:%.*]] = bitcast i32* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[I8]], i8 undef, i64 4, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a = alloca i32, align 4
   %i8 = bitcast i32* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (i32* @undef to i8*), i64 4, i1 false)
   ret void
-; CHECK-LABEL: @test_undef(
-; CHECK:       call void @llvm.memset
-; CHECK-NOT:   call void @llvm.memcpy
-; CHECK:       ret void
 }
 
 @i32x3 = internal constant [3 x i32] [i32 -1, i32 -1, i32 -1], align 4
 define void @test_i32x3() nounwind {
+; CHECK-LABEL: @test_i32x3(
+; CHECK-NEXT:    [[A:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[I8:%.*]] = bitcast [3 x i32]* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[I8]], i8 -1, i64 12, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a = alloca [3 x i32], align 4
   %i8 = bitcast [3 x i32]* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast ([3 x i32]* @i32x3 to i8*), i64 12, i1 false)
   ret void
-; CHECK-LABEL: @test_i32x3(
-; CHECK:       call void @llvm.memset
-; CHECK-NOT:   call void @llvm.memcpy
-; CHECK:       ret void
 }
 
 @i32x3_undef = internal constant [3 x i32] [i32 -1, i32 undef, i32 -1], align 4
 define void @test_i32x3_undef() nounwind {
+; CHECK-LABEL: @test_i32x3_undef(
+; CHECK-NEXT:    [[A:%.*]] = alloca [3 x i32], align 4
+; CHECK-NEXT:    [[I8:%.*]] = bitcast [3 x i32]* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[I8]], i8 -1, i64 12, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a = alloca [3 x i32], align 4
   %i8 = bitcast [3 x i32]* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast ([3 x i32]* @i32x3_undef to i8*), i64 12, i1 false)
   ret void
-; CHECK-LABEL: @test_i32x3_undef(
-; CHECK:       call void @llvm.memset
-; CHECK-NOT:   call void @llvm.memcpy
-; CHECK:       ret void
 }
 
 %struct.bitfield = type { i8, [3 x i8] }
 @bitfield = private unnamed_addr constant %struct.bitfield { i8 -86, [3 x i8] [i8 -86, i8 -86, i8 -86] }, align 4
 define void @test_bitfield() nounwind {
+; CHECK-LABEL: @test_bitfield(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_BITFIELD:%.*]], align 4
+; CHECK-NEXT:    [[I8:%.*]] = bitcast %struct.bitfield* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[I8]], i8 -86, i64 4, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a = alloca %struct.bitfield, align 4
   %i8 = bitcast %struct.bitfield* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (%struct.bitfield* @bitfield to i8*), i64 4, i1 false)
   ret void
-; CHECK-LABEL: @test_bitfield(
-; CHECK:       call void @llvm.memset
-; CHECK-NOT:   call void @llvm.memcpy
-; CHECK:       ret void
 }
 
 @i1x16_zero = internal constant <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, align 4
 define void @test_i1x16_zero() nounwind {
+; CHECK-LABEL: @test_i1x16_zero(
+; CHECK-NEXT:    [[A:%.*]] = alloca <16 x i1>, align 4
+; CHECK-NEXT:    [[I8:%.*]] = bitcast <16 x i1>* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[I8]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a = alloca <16 x i1>, align 4
   %i8 = bitcast <16 x i1>* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (<16 x i1>* @i1x16_zero to i8*), i64 16, i1 false)
   ret void
-; CHECK-LABEL: @test_i1x16_zero(
-; CHECK:       call void @llvm.memset
-; CHECK-NOT:   call void @llvm.memcpy
-; CHECK:       ret void
 }
 
 ; i1 isn't currently handled. Should it?
 @i1x16_one = internal constant <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, align 4
 define void @test_i1x16_one() nounwind {
+; CHECK-LABEL: @test_i1x16_one(
+; CHECK-NEXT:    [[A:%.*]] = alloca <16 x i1>, align 4
+; CHECK-NEXT:    [[I8:%.*]] = bitcast <16 x i1>* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[I8]], i8* align 4 bitcast (<16 x i1>* @i1x16_one to i8*), i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a = alloca <16 x i1>, align 4
   %i8 = bitcast <16 x i1>* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (<16 x i1>* @i1x16_one to i8*), i64 16, i1 false)
   ret void
-; CHECK-LABEL: @test_i1x16_one(
-; CHECK-NOT:   call void @llvm.memset
-; CHECK:      call void @llvm.memcpy
-; CHECK:       ret void
 }
 
 @half = internal constant half 0xH0000, align 4
 define void @test_half() nounwind {
+; CHECK-LABEL: @test_half(
+; CHECK-NEXT:    [[A:%.*]] = alloca half, align 4
+; CHECK-NEXT:    [[I8:%.*]] = bitcast half* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[I8]], i8 0, i64 2, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a = alloca half, align 4
   %i8 = bitcast half* %a to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (half* @half to i8*), i64 2, i1 false)
   ret void
-; CHECK-LABEL: @test_half(
-; CHECK:       call void @llvm.memset
-; CHECK-NOT:   call void @llvm.memcpy
-; CHECK:       ret void
 }

diff  --git a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
index 5cdd1a27258c..e1dd9c92d4e0 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-undef.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basic-aa -memcpyopt -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -6,6 +7,16 @@ target triple = "x86_64-apple-macosx10.8.0"
 %struct.foo = type { i8, [7 x i8], i32 }
 
 define i32 @test1(%struct.foo* nocapture %foobie) nounwind noinline ssp uwtable {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[BLETCH_SROA_1:%.*]] = alloca [7 x i8], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], %struct.foo* [[FOOBIE:%.*]], i64 0, i32 0
+; CHECK-NEXT:    store i8 98, i8* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[FOOBIE]], i64 0, i32 1, i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [7 x i8], [7 x i8]* [[BLETCH_SROA_1]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[FOOBIE]], i64 0, i32 2
+; CHECK-NEXT:    store i32 20, i32* [[TMP4]], align 4
+; CHECK-NEXT:    ret i32 undef
+;
   %bletch.sroa.1 = alloca [7 x i8], align 1
   %1 = getelementptr inbounds %struct.foo, %struct.foo* %foobie, i64 0, i32 0
   store i8 98, i8* %1, align 4
@@ -17,28 +28,31 @@ define i32 @test1(%struct.foo* nocapture %foobie) nounwind noinline ssp uwtable
   ret i32 undef
 
 ; Check that the memcpy is removed.
-; CHECK-LABEL: @test1(
-; CHECK-NOT: call void @llvm.memcpy
 }
 
 define void @test2(i8* sret noalias nocapture %out, i8* %in) nounwind noinline ssp uwtable {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 8, i8* [[IN:%.*]])
+; CHECK-NEXT:    ret void
+;
   call void @llvm.lifetime.start.p0i8(i64 8, i8* %in)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 8, i1 false)
   ret void
 
 ; Check that the memcpy is removed.
-; CHECK-LABEL: @test2(
-; CHECK-NOT: call void @llvm.memcpy
 }
 
 define void @test3(i8* sret noalias nocapture %out, i8* %in) nounwind noinline ssp uwtable {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* [[IN:%.*]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[OUT:%.*]], i8* [[IN]], i64 8, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.lifetime.start.p0i8(i64 4, i8* %in)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 8, i1 false)
   ret void
 
 ; Check that the memcpy is not removed.
-; CHECK-LABEL: @test3(
-; CHECK: call void @llvm.memcpy
 }
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind

diff  --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 1741da030c2e..54e5e75fd6e2 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -basic-aa -memcpyopt -dse -S | FileCheck -enable-var-scope %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basic-aa -memcpyopt -dse -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin9"
@@ -7,6 +8,16 @@ target triple = "i686-apple-darwin9"
 %1 = type { i32, i32 }
 
 define void @test1(%0* sret  %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind  {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [[TMP0:%.*]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub x86_fp80 0xK80000000000000000000, [[Z_1:%.*]]
+; CHECK-NEXT:    call void @ccoshl(%0* sret [[TMP2]], x86_fp80 [[TMP5]], x86_fp80 [[Z_0:%.*]]) [[ATTR0:#.*]]
+; CHECK-NEXT:    [[TMP219:%.*]] = bitcast %0* [[TMP2]] to i8*
+; CHECK-NEXT:    [[AGG_RESULT21:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT21]], i8* align 16 [[TMP219]], i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
 entry:
   %tmp2 = alloca %0
   %memtmp = alloca %0, align 16
@@ -22,11 +33,6 @@ entry:
 ; Check that one of the memcpy's is removed.
 ;; FIXME: PR 8643 We should be able to eliminate the last memcpy here.
 
-; CHECK-LABEL: @test1(
-; CHECK: call void @ccoshl
-; CHECK: call void @llvm.memcpy
-; CHECK-NOT: llvm.memcpy
-; CHECK: ret void
 }
 
 declare void @ccoshl(%0* nocapture sret, x86_fp80, x86_fp80) nounwind
@@ -35,29 +41,31 @@ declare void @ccoshl(%0* nocapture sret, x86_fp80, x86_fp80) nounwind
 ; The intermediate alloca and one of the memcpy's should be eliminated, the
 ; other should be replaced with a memmove.
 define void @test2(i8* %P, i8* %Q) nounwind  {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
   %memtmp = alloca %0, align 16
   %R = bitcast %0* %memtmp to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
   ret void
 
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: call void @llvm.memmove{{.*}}(i8* align 16 %Q, i8* align 16 %P
-; CHECK-NEXT: ret void
 }
 
 ; The intermediate alloca and one of the memcpy's should be eliminated, the
 ; other should be replaced with a memcpy.
 define void @test2_memcpy(i8* noalias %P, i8* noalias %Q) nounwind  {
+; CHECK-LABEL: @test2_memcpy(
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
   %memtmp = alloca %0, align 16
   %R = bitcast %0* %memtmp to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false)
   ret void
 
-; CHECK-LABEL: @test2_memcpy(
-; CHECK-NEXT: call void @llvm.memcpy{{.*}}(i8* align 16 %Q, i8* align 16 %P
-; CHECK-NEXT: ret void
 }
 
 
@@ -66,40 +74,47 @@ define void @test2_memcpy(i8* noalias %P, i8* noalias %Q) nounwind  {
 @x = external global %0
 
 define void @test3(%0* noalias sret %agg.result) nounwind  {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[AGG_RESULT1:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT1]], i8* align 16 bitcast (%0* @x to i8*), i32 32, i1 false)
+; CHECK-NEXT:    ret void
+;
   %x.0 = alloca %0
   %x.01 = bitcast %0* %x.0 to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %x.01, i8* align 16 bitcast (%0* @x to i8*), i32 32, i1 false)
   %agg.result2 = bitcast %0* %agg.result to i8*
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %agg.result2, i8* align 16 %x.01, i32 32, i1 false)
   ret void
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: %agg.result1 = bitcast
-; CHECK-NEXT: call void @llvm.memcpy
-; CHECK-NEXT: ret void
 }
 
 
 ; PR8644
 define void @test4(i8 *%P) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    call void @test4a(i8* byval align 1 [[P:%.*]])
+; CHECK-NEXT:    ret void
+;
   %A = alloca %1
   %a = bitcast %1* %A to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a, i8* align 4 %P, i64 8, i1 false)
   call void @test4a(i8* align 1 byval %a)
   ret void
-; CHECK-LABEL: @test4(
-; CHECK-NEXT: call void @test4a(
 }
 
 ; Make sure we don't remove the memcpy if the source address space doesn't match the byval argument
 define void @test4_addrspace(i8 addrspace(1)* %P) {
-  %A = alloca %1
-  %a = bitcast %1* %A to i8*
-  call void @llvm.memcpy.p0i8.p1i8.i64(i8* align 4 %a, i8 addrspace(1)* align 4 %P, i64 8, i1 false)
-  call void @test4a(i8* align 1 byval %a)
-  ret void
 ; CHECK-LABEL: @test4_addrspace(
-; CHECK: call void @llvm.memcpy.p0i8.p1i8.i64(
-; CHECK-NEXT: call void @test4a(
+; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
+; CHECK-NEXT:    [[A2:%.*]] = bitcast %1* [[A1]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p1i8.i64(i8* align 4 [[A2]], i8 addrspace(1)* align 4 [[P:%.*]], i64 8, i1 false)
+; CHECK-NEXT:    call void @test4a(i8* byval align 1 [[A2]])
+; CHECK-NEXT:    ret void
+;
+  %a1 = alloca %1
+  %a2 = bitcast %1* %a1 to i8*
+  call void @llvm.memcpy.p0i8.p1i8.i64(i8* align 4 %a2, i8 addrspace(1)* align 4 %P, i64 8, i1 false)
+  call void @test4a(i8* align 1 byval %a2)
+  ret void
 }
 
 declare void @test4a(i8* align 1 byval)
@@ -116,6 +131,16 @@ declare void @test5a(%struct.S* align 16 byval) nounwind ssp
 
 ; rdar://8713376 - This memcpy can't be eliminated.
 define i32 @test5(i32 %x) nounwind ssp {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Y:%.*]] = alloca [[STRUCT_S:%.*]], align 16
+; CHECK-NEXT:    [[TMP:%.*]] = bitcast %struct.S* [[Y]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP]], i8* align 16 bitcast (%struct.S* @sS to i8*), i64 32, i1 false)
+; CHECK-NEXT:    [[A:%.*]] = getelementptr [[STRUCT_S]], %struct.S* [[Y]], i64 0, i32 1, i64 0
+; CHECK-NEXT:    store i8 4, i8* [[A]], align 1
+; CHECK-NEXT:    call void @test5a(%struct.S* byval align 16 [[Y]])
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %y = alloca %struct.S, align 16
   %tmp = bitcast %struct.S* %y to i8*
@@ -124,17 +149,15 @@ entry:
   store i8 4, i8* %a
   call void @test5a(%struct.S* align 16 byval %y)
   ret i32 0
-  ; CHECK-LABEL: @test5(
-  ; CHECK: store i8 4
-  ; CHECK: call void @test5a(%struct.S* byval align 16 %y)
 }
 
 ;; Noop memcpy should be zapped.
 define void @test6(i8 *%P) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %P, i8* align 4 %P, i64 8, i1 false)
   ret void
-; CHECK-LABEL: @test6(
-; CHECK-NEXT: ret void
 }
 
 
@@ -143,6 +166,11 @@ define void @test6(i8 *%P) {
 %struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
 
 define i32 @test7(%struct.p* nocapture align 8 byval %q) nounwind ssp {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @g(%struct.p* byval align 8 [[Q:%.*]]) [[ATTR0]]
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
 entry:
   %agg.tmp = alloca %struct.p, align 4
   %tmp = bitcast %struct.p* %agg.tmp to i8*
@@ -150,8 +178,6 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %tmp, i8* align 4 %tmp1, i64 48, i1 false)
   %call = call i32 @g(%struct.p* align 8 byval %agg.tmp) nounwind
   ret i32 %call
-; CHECK-LABEL: @test7(
-; CHECK: call i32 @g(%struct.p* byval align 8 %q) [[$NUW:#[0-9]+]]
 }
 
 declare i32 @g(%struct.p* align 8 byval)
@@ -163,8 +189,9 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) n
 @test8.str = internal constant [7 x i8] c"ABCDEF\00"
 
 define void @test8() {
-; CHECK: test8
-; CHECK-NOT: memcpy
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    ret void
+;
   %A = tail call i8* @malloc(i32 10)
   %B = getelementptr inbounds i8, i8* %A, i64 2
   tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %B, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @test8.str, i64 0, i64 0), i32 7, i1 false)
@@ -172,7 +199,6 @@ define void @test8() {
   %D = getelementptr inbounds i8, i8* %C, i64 2
   tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %D, i8* %B, i32 7, i1 false)
   ret void
-; CHECK: ret void
 }
 
 declare noalias i8* @malloc(i32)
@@ -181,11 +207,14 @@ declare noalias i8* @malloc(i32)
 %struct.big = type { [50 x i32] }
 
 define void @test9_addrspacecast() nounwind ssp uwtable {
-entry:
 ; CHECK-LABEL: @test9_addrspacecast(
-; CHECK: f1
-; CHECK-NOT: memcpy
-; CHECK: f2
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4
+; CHECK-NEXT:    call void @f1(%struct.big* sret [[B]])
+; CHECK-NEXT:    call void @f2(%struct.big* [[B]])
+; CHECK-NEXT:    ret void
+;
+entry:
   %b = alloca %struct.big, align 4
   %tmp = alloca %struct.big, align 4
   call void @f1(%struct.big* sret %tmp)
@@ -197,11 +226,14 @@ entry:
 }
 
 define void @test9() nounwind ssp uwtable {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4
+; CHECK-NEXT:    call void @f1(%struct.big* sret [[B]])
+; CHECK-NEXT:    call void @f2(%struct.big* [[B]])
+; CHECK-NEXT:    ret void
+;
 entry:
-; CHECK: test9
-; CHECK: f1
-; CHECK-NOT: memcpy
-; CHECK: f2
   %b = alloca %struct.big, align 4
   %tmp = alloca %struct.big, align 4
   call void @f1(%struct.big* sret %tmp)
@@ -220,6 +252,15 @@ entry:
 declare void @foo(i32* noalias nocapture)
 
 define void @test10(%opaque* noalias nocapture sret %x, i32 %y) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[Y:%.*]], i32* [[A]], align 4
+; CHECK-NEXT:    call void @foo(i32* noalias nocapture [[A]])
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    [[D:%.*]] = bitcast %opaque* [[X:%.*]] to i32*
+; CHECK-NEXT:    store i32 [[C]], i32* [[D]], align 4
+; CHECK-NEXT:    ret void
+;
   %a = alloca i32, align 4
   store i32 %y, i32* %a
   call void @foo(i32* noalias nocapture %a)
@@ -231,14 +272,17 @@ define void @test10(%opaque* noalias nocapture sret %x, i32 %y) {
 
 ; don't create new addressspacecasts when we don't know they're safe for the target
 define void @test11([20 x i32] addrspace(1)* nocapture dereferenceable(80) %P) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[B:%.*]] = bitcast [20 x i32] addrspace(1)* [[P:%.*]] to i8 addrspace(1)*
+; CHECK-NEXT:    call void @llvm.memset.p1i8.i64(i8 addrspace(1)* align 4 [[B]], i8 0, i64 80, i1 false)
+; CHECK-NEXT:    ret void
+;
   %A = alloca [20 x i32], align 4
   %a = bitcast [20 x i32]* %A to i8*
   %b = bitcast [20 x i32] addrspace(1)* %P to i8 addrspace(1)*
   call void @llvm.memset.p0i8.i64(i8* align 4 %a, i8 0, i64 80, i1 false)
   call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* align 4 %b, i8* align 4 %a, i64 80, i1 false)
   ret void
-; CHECK-LABEL: @test11(
-; CHECK-NOT: addrspacecast
 }
 
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
@@ -247,7 +291,7 @@ declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocaptur
 declare void @f1(%struct.big* nocapture sret)
 declare void @f2(%struct.big*)
 
-; CHECK: attributes [[$NUW]] = { nounwind }
+; CHECK: attributes [[ATTR0]] = { nounwind }
 ; CHECK: attributes #1 = { argmemonly nounwind willreturn }
 ; CHECK: attributes #2 = { nounwind ssp }
 ; CHECK: attributes #3 = { nounwind ssp uwtable }

diff --git a/llvm/test/Transforms/MemCpyOpt/memmove.ll b/llvm/test/Transforms/MemCpyOpt/memmove.ll
index d152cfb63f2b..4a75cfe6a046 100644
--- a/llvm/test/Transforms/MemCpyOpt/memmove.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memmove.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basic-aa -memcpyopt -S | FileCheck %s
 ; These memmoves should get optimized to memcpys.
 
@@ -7,9 +8,15 @@ target triple = "x86_64-apple-darwin9.0"
 declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
 
 define i8* @test1(i8* nocapture %src) nounwind {
-entry:
 ; CHECK-LABEL: @test1(
-; CHECK: call void @llvm.memcpy
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MALLOCCALL:%.*]] = tail call i8* @malloc(i32 trunc (i64 mul nuw (i64 ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64), i64 13) to i32))
+; CHECK-NEXT:    [[CALL3:%.*]] = bitcast i8* [[MALLOCCALL]] to [13 x i8]*
+; CHECK-NEXT:    [[CALL3_SUB:%.*]] = getelementptr inbounds [13 x i8], [13 x i8]* [[CALL3]], i64 0, i64 0
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[CALL3_SUB]], i8* [[SRC:%.*]], i64 13, i1 false)
+; CHECK-NEXT:    ret i8* [[CALL3_SUB]]
+;
+entry:
 
   %malloccall = tail call i8* @malloc(i32 trunc (i64 mul nuw (i64 ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64), i64 13) to i32))
   %call3 = bitcast i8* %malloccall to [13 x i8]*
@@ -21,9 +28,13 @@ declare noalias i8* @malloc(i32)
 
 
 define void @test2(i8* %P) nounwind {
-entry:
 ; CHECK-LABEL: @test2(
-; CHECK: call void @llvm.memcpy
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr i8, i8* [[P:%.*]], i64 16
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[ADD_PTR]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
   %add.ptr = getelementptr i8, i8* %P, i64 16
   tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %add.ptr, i64 16, i1 false)
   ret void
@@ -31,9 +42,13 @@ entry:
 
 ; This cannot be optimize because the src/dst really do overlap.
 define void @test3(i8* %P) nounwind {
-entry:
 ; CHECK-LABEL: @test3(
-; CHECK: call void @llvm.memmove
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr i8, i8* [[P:%.*]], i64 16
+; CHECK-NEXT:    tail call void @llvm.memmove.p0i8.p0i8.i64(i8* [[P]], i8* [[ADD_PTR]], i64 17, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
   %add.ptr = getelementptr i8, i8* %P, i64 16
   tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %add.ptr, i64 17, i1 false)
   ret void

diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
index 7ee0682ed229..52ac35ba5da5 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
@@ -130,7 +130,7 @@ define void @test_write_between(i8* %result) {
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
 ; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
 ; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 12, i1 false)
-; CHECK-NEXT:    store i8 -1, i8* [[B]]
+; CHECK-NEXT:    store i8 -1, i8* [[B]], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
 ; CHECK-NEXT:    ret void
 ;
@@ -148,7 +148,7 @@ define void @test_write_before_memset_in_memset_region(i8* %result) {
 ; CHECK-LABEL: @test_write_before_memset_in_memset_region(
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
 ; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
-; CHECK-NEXT:    store i8 -1, i8* [[B]]
+; CHECK-NEXT:    store i8 -1, i8* [[B]], align 1
 ; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 8, i1 false)
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
 ; CHECK-NEXT:    ret void
@@ -168,7 +168,7 @@ define void @test_write_before_memset_in_memcpy_region(i8* %result) {
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
 ; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
 ; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[T]], %T* [[A]], i64 0, i32 2
-; CHECK-NEXT:    store i32 -1, i32* [[C]]
+; CHECK-NEXT:    store i32 -1, i32* [[C]], align 4
 ; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 8, i1 false)
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
 ; CHECK-NEXT:    ret void
@@ -189,7 +189,7 @@ define void @test_write_before_memset_in_both_regions(i8* %result) {
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
 ; CHECK-NEXT:    [[B:%.*]] = bitcast %T* [[A]] to i8*
 ; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[T]], %T* [[A]], i64 0, i32 1
-; CHECK-NEXT:    store i32 -1, i32* [[C]]
+; CHECK-NEXT:    store i32 -1, i32* [[C]], align 4
 ; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[B]], i8 0, i64 10, i1 false)
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[RESULT:%.*]], i8* align 8 [[B]], i64 16, i1 false)
 ; CHECK-NEXT:    ret void

diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
index 651ac3194a15..758a093a3b65 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-redundant-memset.ll
@@ -1,126 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -basic-aa -memcpyopt -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK-LABEL: define void @test
-; CHECK: [[ULE:%[0-9]+]] = icmp ule i64 %dst_size, %src_size
-; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i64 %dst_size, %src_size
-; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]]
-; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 %src_size
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 %c, i64 [[SIZE]], i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false)
-; CHECK-NEXT: ret void
 define void @test(i8* %src, i64 %src_size, i8* %dst, i64 %dst_size, i8 %c) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 [[SRC_SIZE]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP4]], i8 [[C:%.*]], i64 [[TMP3]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst, i8 %c, i64 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_different_types_i32_i64
-; CHECK: [[DSTSIZE:%[0-9]+]] = zext i32 %dst_size to i64
-; CHECK: [[ULE:%[0-9]+]] = icmp ule i64 [[DSTSIZE]], %src_size
-; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i64 [[DSTSIZE]], %src_size
-; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]]
-; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 %src_size
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 %c, i64 [[SIZE]], i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false)
-; CHECK-NEXT: ret void
 define void @test_different_types_i32_i64(i8* %dst, i8* %src, i32 %dst_size, i64 %src_size, i8 %c) {
+; CHECK-LABEL: @test_different_types_i32_i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[DST_SIZE:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule i64 [[TMP1]], [[SRC_SIZE:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[TMP1]], [[SRC_SIZE]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 [[SRC_SIZE]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP5]], i8 [[C:%.*]], i64 [[TMP4]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i32(i8* %dst, i8 %c, i32 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_different_types_i128_i32
-; CHECK: [[SRCSIZE:%[0-9]+]] = zext i32 %src_size to i128
-; CHECK: [[ULE:%[0-9]+]] = icmp ule i128 %dst_size, [[SRCSIZE]]
-; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i128 %dst_size, [[SRCSIZE]]
-; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i128 0, i128 [[SIZEDIFF]]
-; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i128 [[SRCSIZE]]
-; CHECK-NEXT: call void @llvm.memset.p0i8.i128(i8* align 1 [[DST]], i8 %c, i128 [[SIZE]], i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %src_size, i1 false)
-; CHECK-NEXT: ret void
 define void @test_different_types_i128_i32(i8* %dst, i8* %src, i128 %dst_size, i32 %src_size, i8 %c) {
+; CHECK-LABEL: @test_different_types_i128_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[SRC_SIZE:%.*]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule i128 [[DST_SIZE:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i128 [[DST_SIZE]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP2]], i128 0, i128 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[DST:%.*]], i128 [[TMP1]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i128(i8* align 1 [[TMP5]], i8 [[C:%.*]], i128 [[TMP4]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DST]], i8* [[SRC:%.*]], i32 [[SRC_SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i128(i8* %dst, i8 %c, i128 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %src_size, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_different_types_i32_i128
-; CHECK: [[DSTSIZE:%[0-9]+]] = zext i32 %dst_size to i128
-; CHECK: [[ULE:%[0-9]+]] = icmp ule i128 [[DSTSIZE]], %src_size
-; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i128 [[DSTSIZE]], %src_size
-; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i128 0, i128 [[SIZEDIFF]]
-; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i128 %src_size
-; CHECK-NEXT: call void @llvm.memset.p0i8.i128(i8* align 1 [[DST]], i8 %c, i128 [[SIZE]], i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i128(i8* %dst, i8* %src, i128 %src_size, i1 false)
-; CHECK-NEXT: ret void
 define void @test_different_types_i32_i128(i8* %dst, i8* %src, i32 %dst_size, i128 %src_size, i8 %c) {
+; CHECK-LABEL: @test_different_types_i32_i128(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[DST_SIZE:%.*]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule i128 [[TMP1]], [[SRC_SIZE:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i128 [[TMP1]], [[SRC_SIZE]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP2]], i128 0, i128 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[DST:%.*]], i128 [[SRC_SIZE]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i128(i8* align 1 [[TMP5]], i8 [[C:%.*]], i128 [[TMP4]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i128(i8* [[DST]], i8* [[SRC:%.*]], i128 [[SRC_SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i32(i8* %dst, i8 %c, i32 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i128(i8* %dst, i8* %src, i128 %src_size, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_different_types_i64_i32
-; CHECK: [[SRCSIZE:%[0-9]+]] = zext i32 %src_size to i64
-; CHECK: [[ULE:%[0-9]+]] = icmp ule i64 %dst_size, [[SRCSIZE]]
-; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i64 %dst_size, [[SRCSIZE]]
-; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]]
-; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 [[SRCSIZE]]
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 %c, i64 [[SIZE]], i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %src_size, i1 false)
-; CHECK-NEXT: ret void
 define void @test_different_types_i64_i32(i8* %dst, i8* %src, i64 %dst_size, i32 %src_size, i8 %c) {
+; CHECK-LABEL: @test_different_types_i64_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[SRC_SIZE:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[DST_SIZE]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP5]], i8 [[C:%.*]], i64 [[TMP4]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DST]], i8* [[SRC:%.*]], i32 [[SRC_SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst, i8 %c, i64 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, i32 %src_size, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_align_same
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 {{.*}}, i8 0, i64 {{.*}}, i1 false)
 define void @test_align_same(i8* %src, i8* %dst, i64 %dst_size) {
+; CHECK-LABEL: @test_align_same(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], 80
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[DST_SIZE]], 80
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 80
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP4]], i8 0, i64 [[TMP3]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC:%.*]], i64 80, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* align 8 %dst, i8 0, i64 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 80, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_align_min
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 {{.*}}, i8 0, i64 {{.*}}, i1 false)
 define void @test_align_min(i8* %src, i8* %dst, i64 %dst_size) {
+; CHECK-LABEL: @test_align_min(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], 36
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[DST_SIZE]], 36
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 36
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP4]], i8 0, i64 [[TMP3]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC:%.*]], i64 36, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* align 8 %dst, i8 0, i64 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 36, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_align_memcpy
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 {{.*}}, i8 0, i64 {{.*}}, i1 false)
 define void @test_align_memcpy(i8* %src, i8* %dst, i64 %dst_size) {
+; CHECK-LABEL: @test_align_memcpy(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], 80
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[DST_SIZE]], 80
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[DST:%.*]], i64 80
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP4]], i8 0, i64 [[TMP3]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DST]], i8* align 8 [[SRC:%.*]], i64 80, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %src, i64 80, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_non_i8_dst_type
-; CHECK-NEXT: %dst = bitcast i64* %dst_pi64 to i8*
-; CHECK: [[ULE:%[0-9]+]] = icmp ule i64 %dst_size, %src_size
-; CHECK: [[SIZEDIFF:%[0-9]+]] = sub i64 %dst_size, %src_size
-; CHECK: [[SIZE:%[0-9]+]] = select i1 [[ULE]], i64 0, i64 [[SIZEDIFF]]
-; CHECK: [[DST:%[0-9]+]] = getelementptr i8, i8* %dst, i64 %src_size
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[DST]], i8 %c, i64 [[SIZE]], i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false)
-; CHECK-NEXT: ret void
 define void @test_non_i8_dst_type(i8* %src, i64 %src_size, i64* %dst_pi64, i64 %dst_size, i8 %c) {
+; CHECK-LABEL: @test_non_i8_dst_type(
+; CHECK-NEXT:    [[DST:%.*]] = bitcast i64* [[DST_PI64:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule i64 [[DST_SIZE:%.*]], [[SRC_SIZE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[DST_SIZE]], [[SRC_SIZE]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[DST]], i64 [[SRC_SIZE]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP4]], i8 [[C:%.*]], i64 [[TMP3]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC:%.*]], i64 [[SRC_SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
   %dst = bitcast i64* %dst_pi64 to i8*
   call void @llvm.memset.p0i8.i64(i8* %dst, i8 %c, i64 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %src_size, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_different_dst
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src, i64 %src_size, i1 false)
-; CHECK-NEXT: ret void
 define void @test_different_dst(i8* %dst2, i8* %src, i64 %src_size, i8* %dst, i64 %dst_size) {
+; CHECK-LABEL: @test_different_dst(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST:%.*]], i8 0, i64 [[DST_SIZE:%.*]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST2:%.*]], i8* [[SRC:%.*]], i64 [[SRC_SIZE:%.*]], i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst, i8 0, i64 %dst_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %src, i64 %src_size, i1 false)
   ret void
@@ -128,12 +157,13 @@ define void @test_different_dst(i8* %dst2, i8* %src, i64 %src_size, i8* %dst, i6
 
 ; Make sure we also take into account dependencies on the destination.
 
-; CHECK-LABEL: define i8 @test_intermediate_read
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false)
-; CHECK-NEXT: %r = load i8, i8* %a
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 24, i1 false)
-; CHECK-NEXT: ret i8 %r
 define i8 @test_intermediate_read(i8* %a, i8* %b) #0 {
+; CHECK-LABEL: @test_intermediate_read(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[A:%.*]], i8 0, i64 64, i1 false)
+; CHECK-NEXT:    [[R:%.*]] = load i8, i8* [[A]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[A]], i8* [[B:%.*]], i64 24, i1 false)
+; CHECK-NEXT:    ret i8 [[R]]
+;
   call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i1 false)
   %r = load i8, i8* %a
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 24, i1 false)
@@ -142,15 +172,16 @@ define i8 @test_intermediate_read(i8* %a, i8* %b) #0 {
 
 %struct = type { [8 x i8], [8 x i8] }
 
-; CHECK-LABEL: define void @test_intermediate_write
-; CHECK-NEXT: %a = alloca %struct
-; CHECK-NEXT: %a0 = getelementptr %struct, %struct* %a, i32 0, i32 0, i32 0
-; CHECK-NEXT: %a1 = getelementptr %struct, %struct* %a, i32 0, i32 1, i32 0
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %a0, i8 0, i64 16, i1 false)
-; CHECK-NEXT: store i8 1, i8* %a1
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a0, i8* %b, i64 8, i1 false)
-; CHECK-NEXT: ret void
 define void @test_intermediate_write(i8* %b) #0 {
+; CHECK-LABEL: @test_intermediate_write(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT:%.*]], align 8
+; CHECK-NEXT:    [[A0:%.*]] = getelementptr [[STRUCT]], %struct* [[A]], i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[A1:%.*]] = getelementptr [[STRUCT]], %struct* [[A]], i32 0, i32 1, i32 0
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[A0]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    store i8 1, i8* [[A1]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[A0]], i8* [[B:%.*]], i64 8, i1 false)
+; CHECK-NEXT:    ret void
+;
   %a = alloca %struct
   %a0 = getelementptr %struct, %struct* %a, i32 0, i32 0, i32 0
   %a1 = getelementptr %struct, %struct* %a, i32 0, i32 1, i32 0

diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
index e36389a128f9..8867c4f810b2 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
@@ -1,73 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -memcpyopt -S %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK-LABEL: define void @test(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %dst2, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: ret void
 define void @test(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 128, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_smaller_memcpy(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 100, i1 false)
-; CHECK-NEXT: ret void
 define void @test_smaller_memcpy(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_smaller_memcpy(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST2:%.*]], i8 [[C]], i64 100, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 100, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_smaller_memset(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 100, i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false)
-; CHECK-NEXT: ret void
 define void @test_smaller_memset(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_smaller_memset(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 100, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST2:%.*]], i8* [[DST1]], i64 128, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 100, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_align_memset(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: ret void
 define void @test_align_memset(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_align_memset(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST2:%.*]], i8 [[C]], i64 128, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_different_types(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %dst2, i8 %c, i32 100, i1 false)
-; CHECK-NEXT: ret void
 define void @test_different_types(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_types(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* [[DST2:%.*]], i8 [[C]], i32 100, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst2, i8* %dst1, i32 100, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_different_types_2(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 %dst1, i8 %c, i32 128, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst2, i8 %c, i64 100, i1 false)
-; CHECK-NEXT: ret void
 define void @test_different_types_2(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_types_2(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i32 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST2:%.*]], i8 [[C]], i64 100, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i32(i8* align 8 %dst1, i8 %c, i32 128, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 100, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_different_source_gep(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: %p = getelementptr i8, i8* %dst1, i64 64
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %p, i64 64, i1 false)
-; CHECK-NEXT: ret void
 define void @test_different_source_gep(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_source_gep(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    [[P:%.*]] = getelementptr i8, i8* [[DST1]], i64 64
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST2:%.*]], i8* [[P]], i64 64, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
   ; FIXME: We could optimize this as well.
   %p = getelementptr i8, i8* %dst1, i64 64
@@ -75,21 +83,23 @@ define void @test_different_source_gep(i8* %dst1, i8* %dst2, i8 %c) {
   ret void
 }
 
-; CHECK-LABEL: define void @test_variable_size_1(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false)
-; CHECK-NEXT: ret void
 define void @test_variable_size_1(i8* %dst1, i64 %dst1_size, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_variable_size_1(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 [[DST1_SIZE:%.*]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST2:%.*]], i8* [[DST1]], i64 128, i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 128, i1 false)
   ret void
 }
 
-; CHECK-LABEL: define void @test_variable_size_2(
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 %dst2_size, i1 false)
-; CHECK-NEXT: ret void
 define void @test_variable_size_2(i8* %dst1, i8* %dst2, i64 %dst2_size, i8 %c) {
+; CHECK-LABEL: @test_variable_size_2(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST2:%.*]], i8* [[DST1]], i64 [[DST2_SIZE:%.*]], i1 false)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst2, i8* %dst1, i64 %dst2_size, i1 false)
   ret void

diff --git a/llvm/test/Transforms/MemCpyOpt/nontemporal.ll b/llvm/test/Transforms/MemCpyOpt/nontemporal.ll
index d9dafcc7b816..a67aa8cf0007 100644
--- a/llvm/test/Transforms/MemCpyOpt/nontemporal.ll
+++ b/llvm/test/Transforms/MemCpyOpt/nontemporal.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -memcpyopt -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -5,16 +6,25 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; Verify that we don't combine nontemporal stores into memset calls.
 
 define void @nontemporal_stores_1(<4 x float>* nocapture %dst) {
-; CHECK-LABEL: @nontemporal_stores_1
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !0
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr1, align 16, !nontemporal !0
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr2, align 16, !nontemporal !0
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr3, align 16, !nontemporal !0
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr4, align 16, !nontemporal !0
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr5, align 16, !nontemporal !0
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr6, align 16, !nontemporal !0
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr7, align 16, !nontemporal !0
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @nontemporal_stores_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[DST:%.*]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[DST]], i64 1
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[PTR1]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[DST]], i64 2
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[PTR2]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[DST]], i64 3
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[PTR3]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[PTR4:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[DST]], i64 4
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[PTR4]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[PTR5:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[DST]], i64 5
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[PTR5]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[PTR6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[DST]], i64 6
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[PTR6]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[PTR7:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[DST]], i64 7
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[PTR7]], align 16, !nontemporal !0
+; CHECK-NEXT:    ret void
+;
 entry:
   store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !0
   %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 1
@@ -35,10 +45,13 @@ entry:
 }
 
 define void @nontemporal_stores_2(<4 x float>* nocapture %dst) {
-; CHECK-LABEL: @nontemporal_stores_2
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !0
-; CHECK: store <4 x float> zeroinitializer, <4 x float>* %ptr1, align 16, !nontemporal !0
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @nontemporal_stores_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[DST:%.*]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[DST]], i64 1
+; CHECK-NEXT:    store <4 x float> zeroinitializer, <4 x float>* [[PTR1]], align 16, !nontemporal !0
+; CHECK-NEXT:    ret void
+;
 entry:
   store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !0
   %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %dst, i64 1

diff  --git a/llvm/test/Transforms/MemCpyOpt/pr29105.ll b/llvm/test/Transforms/MemCpyOpt/pr29105.ll
index e9e9b611aef2..e83508606e55 100644
--- a/llvm/test/Transforms/MemCpyOpt/pr29105.ll
+++ b/llvm/test/Transforms/MemCpyOpt/pr29105.ll
@@ -1,10 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -memcpyopt -instcombine -S %s | FileCheck %s
 %Foo = type { [2048 x i64] }
 
 ; Make sure that all mempcy calls are converted to memset calls, or removed.
-; CHECK-LABEL: @baz(
-; CHECK-NOT: call void @llvm.memcpy
 define void @baz() unnamed_addr #0 {
+; CHECK-LABEL: @baz(
+; CHECK-NEXT:  entry-block:
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [[FOO:%.*]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %Foo* [[TMP2]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 16384, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(16384) [[TMP0]], i8 0, i64 16384, i1 false)
+; CHECK-NEXT:    call void @bar(%Foo* noalias nocapture nonnull dereferenceable(16384) [[TMP2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 16384, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    ret void
+;
 entry-block:
   %x.sroa.0 = alloca [2048 x i64], align 8
   %tmp0 = alloca [2048 x i64], align 8

diff  --git a/llvm/test/Transforms/MemCpyOpt/pr37967.ll b/llvm/test/Transforms/MemCpyOpt/pr37967.ll
index 6b6a40873273..8a4e88881d0c 100644
--- a/llvm/test/Transforms/MemCpyOpt/pr37967.ll
+++ b/llvm/test/Transforms/MemCpyOpt/pr37967.ll
@@ -1,16 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -debugify -memcpyopt -check-debugify -S < %s 2>&1 | FileCheck %s
 
 ; CHECK: CheckModuleDebugify: PASS
 
-; CHECK-LABEL: define {{.*}} @_Z3bar3Foo
-; CHECK: [[target:%.*]] = load i8*, i8** bitcast (%struct.Foo** @a to i8**), align 8, !dbg
-; CHECK: %tmpcast = bitcast i8* [[target]] to %struct.Foo*, !dbg
-
 %struct.Foo = type { i64, i64, i64 }
 
 @a = dso_local global %struct.Foo* null, align 8
 
 define dso_local void @_Z3bar3Foo(%struct.Foo* byval(%struct.Foo) align 8 %0) {
+; CHECK-LABEL: @_Z3bar3Foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8, [[DBG13:!dbg !.*]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata %struct.Foo* [[AGG_TMP]], [[META9:metadata !.*]], metadata !DIExpression()), [[DBG13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8*, i8** bitcast (%struct.Foo** @a to i8**), align 8, [[DBG14:!dbg !.*]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[TMP1]], [[META11:metadata !.*]], metadata !DIExpression()), [[DBG14]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.Foo* [[AGG_TMP]] to i8*, [[DBG15:!dbg !.*]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[TMP2]], [[META12:metadata !.*]], metadata !DIExpression()), [[DBG15]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(24) [[TMP2]], i8* nonnull align 8 dereferenceable(24) [[TMP1]], i64 24, i1 false), [[DBG16:!dbg !.*]]
+; CHECK-NEXT:    [[TMPCAST:%.*]] = bitcast i8* [[TMP1]] to %struct.Foo*, [[DBG16]]
+; CHECK-NEXT:    call void @_Z3bar3Foo(%struct.Foo* nonnull byval(%struct.Foo) align 8 [[TMPCAST]]), [[DBG17:!dbg !.*]]
+; CHECK-NEXT:    ret void, [[DBG18:!dbg !.*]]
+;
 entry:
   %agg.tmp = alloca %struct.Foo, align 8
   %1 = load i8*, i8** bitcast (%struct.Foo** @a to i8**), align 8

diff  --git a/llvm/test/Transforms/MemCpyOpt/process_store.ll b/llvm/test/Transforms/MemCpyOpt/process_store.ll
index e2edef0a94f7..7b647e556b56 100644
--- a/llvm/test/Transforms/MemCpyOpt/process_store.ll
+++ b/llvm/test/Transforms/MemCpyOpt/process_store.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -memcpyopt -disable-output
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -memcpyopt | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -10,6 +11,17 @@ declare dso_local i32 @f1()
 
 ; Do not crash due to store first in BB.
 define dso_local void @f2() {
+; CHECK-LABEL: @f2(
+; CHECK-NEXT:  for.end:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @b, align 4
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    store i32 [[TMP1:%.*]], i32* @a, align 4
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @f1()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[CALL]], 0
+; CHECK-NEXT:    [[TMP1]] = load i32, i32* @b, align 4
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+;
 for.end:
   %0 = load i32, i32* @b, align 4
   ret void
@@ -24,6 +36,19 @@ for.body:
 
 ; Do not crash due to call not before store in BB.
 define dso_local void @f3() {
+; CHECK-LABEL: @f3(
+; CHECK-NEXT:  for.end:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @b, align 4
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[T:%.*]] = add i32 [[T2:%.*]], 1
+; CHECK-NEXT:    store i32 [[TMP1:%.*]], i32* @a, align 4
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @f1()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[CALL]], 0
+; CHECK-NEXT:    [[TMP1]] = load i32, i32* @b, align 4
+; CHECK-NEXT:    [[T2]] = xor i32 [[T]], 5
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+;
 for.end:
   %0 = load i32, i32* @b, align 4
   ret void

diff  --git a/llvm/test/Transforms/MemCpyOpt/profitable-memset.ll b/llvm/test/Transforms/MemCpyOpt/profitable-memset.ll
index 649d2386f960..c45ccb9c9aba 100644
--- a/llvm/test/Transforms/MemCpyOpt/profitable-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/profitable-memset.ll
@@ -1,12 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -memcpyopt -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
-; CHECK-LABEL: @foo(
-; CHECK-NOT: store
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 2 %2, i8 0, i64 8, i1 false)
-
 define void @foo(i64* nocapture %P) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[P:%.*]] to i16*
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[ARRAYIDX]] to i32*
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[TMP0]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 2 [[TMP2]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = bitcast i64* %P to i16*
   %arrayidx = getelementptr inbounds i16, i16* %0, i64 1

diff  --git a/llvm/test/Transforms/MemCpyOpt/smaller.ll b/llvm/test/Transforms/MemCpyOpt/smaller.ll
index 0c82b5201dca..1aed83fdb4d6 100644
--- a/llvm/test/Transforms/MemCpyOpt/smaller.ll
+++ b/llvm/test/Transforms/MemCpyOpt/smaller.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -memcpyopt -S < %s | FileCheck %s
 ; RUN: opt -passes=memcpyopt -S < %s | FileCheck %s
 ; rdar://8875553
@@ -5,8 +6,6 @@
 ; Memcpyopt shouldn't optimize the second memcpy using the first
 ; because the first has a smaller size.
 
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %tmp, i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 0, i32 0), i32 16, i1 false)
-
 target datalayout = "e-p:32:32:32"
 
 %struct.s = type { [11 x i8], i32 }
@@ -18,6 +17,16 @@ declare void @check(%struct.s* byval %p) nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind
 
 define void @foo() nounwind {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+; CHECK-NEXT:    store i32 99, i32* getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 1), align 4
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 0, i32 0), i8* align 1 getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 11, i1 false)
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[AGG_TMP]], i32 0, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP]], i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 0, i32 0), i32 16, i1 false)
+; CHECK-NEXT:    call void @check(%struct.s* byval [[AGG_TMP]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %agg.tmp = alloca %struct.s, align 4
   store i32 99, i32* getelementptr inbounds (%struct.s, %struct.s* @cell, i32 0, i32 1), align 4

diff  --git a/llvm/test/Transforms/MemCpyOpt/sret.ll b/llvm/test/Transforms/MemCpyOpt/sret.ll
index af625127f56b..f5ffbeaf239f 100644
--- a/llvm/test/Transforms/MemCpyOpt/sret.ll
+++ b/llvm/test/Transforms/MemCpyOpt/sret.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -basic-aa -memcpyopt -S | not grep "call.*memcpy"
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basic-aa -memcpyopt -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin9"
@@ -6,6 +7,24 @@ target triple = "i686-apple-darwin9"
 %0 = type { x86_fp80, x86_fp80 }
 
 define void @ccosl(%0* noalias sret %agg.result, %0* byval align 8 %z) nounwind {
+; CHECK-LABEL: @ccosl(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IZ:%.*]] = alloca [[TMP0:%.*]], align 16
+; CHECK-NEXT:    [[MEMTMP:%.*]] = alloca [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[TMP0]], %0* [[Z:%.*]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load x86_fp80, x86_fp80* [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub x86_fp80 0xK80000000000000000000, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [[TMP0]], %0* [[IZ]], i32 0, i32 1
+; CHECK-NEXT:    [[REAL:%.*]] = getelementptr [[TMP0]], %0* [[IZ]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr [[TMP0]], %0* [[Z]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = load x86_fp80, x86_fp80* [[TMP7]], align 16
+; CHECK-NEXT:    store x86_fp80 [[TMP3]], x86_fp80* [[REAL]], align 16
+; CHECK-NEXT:    store x86_fp80 [[TMP8]], x86_fp80* [[TMP4]], align 16
+; CHECK-NEXT:    call void @ccoshl(%0* noalias sret [[AGG_RESULT:%.*]], %0* byval align 8 [[IZ]]) [[ATTR0:#.*]]
+; CHECK-NEXT:    [[MEMTMP14:%.*]] = bitcast %0* [[MEMTMP]] to i8*
+; CHECK-NEXT:    [[AGG_RESULT15:%.*]] = bitcast %0* [[AGG_RESULT]] to i8*
+; CHECK-NEXT:    ret void
+;
 entry:
   %iz = alloca %0
   %memtmp = alloca %0, align 16

diff  --git a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
index 4bead3381ccd..6f7a7c898dd9 100644
--- a/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stackrestore.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -memcpyopt < %s | FileCheck %s
 
 ; PR40118: BasicAA didn't realize that stackrestore ends the lifetime of
@@ -14,6 +15,20 @@ target triple = "i686-unknown-windows-msvc19.14.26433"
 ; a call to @external.
 
 define i32 @test_norestore(i32 %n) {
+; CHECK-LABEL: @test_norestore(
+; CHECK-NEXT:    [[TMPMEM:%.*]] = alloca [10 x i8], align 4
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[TMPMEM]], i32 0, i32 0
+; CHECK-NEXT:    [[P:%.*]] = alloca i8, i32 [[N:%.*]], align 4
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[P]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
+; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 9
+; CHECK-NEXT:    store i8 0, i8* [[P10]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP]], i8* [[P]], i32 10, i1 false)
+; CHECK-NEXT:    call void @external()
+; CHECK-NEXT:    [[HEAP:%.*]] = call i8* @malloc(i32 9)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[HEAP]], i8* [[P]], i32 9, i1 false)
+; CHECK-NEXT:    call void @useit(i8* [[HEAP]])
+; CHECK-NEXT:    ret i32 0
+;
   %tmpmem = alloca [10 x i8], align 4
   %tmp = getelementptr inbounds [10 x i8], [10 x i8]* %tmpmem, i32 0, i32 0
 
@@ -33,15 +48,25 @@ define i32 @test_norestore(i32 %n) {
   ret i32 0
 }
 
-; CHECK-LABEL: define i32 @test_norestore(i32 %n)
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* %p, i32 10, i1 false)
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %heap, i8* %p, i32 9, i1 false)
-
-
 ; Do not propagate memcpy from %p across the stackrestore.
 
 define i32 @test_stackrestore() {
+; CHECK-LABEL: @test_stackrestore(
+; CHECK-NEXT:    [[TMPMEM:%.*]] = alloca [10 x i8], align 4
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[TMPMEM]], i32 0, i32 0
+; CHECK-NEXT:    [[INALLOCA_SAVE:%.*]] = tail call i8* @llvm.stacksave()
+; CHECK-NEXT:    [[ARGMEM:%.*]] = alloca inalloca [10 x i8], align 4
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[ARGMEM]], i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[P]], i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
+; CHECK-NEXT:    [[P10:%.*]] = getelementptr inbounds [10 x i8], [10 x i8]* [[ARGMEM]], i32 0, i32 9
+; CHECK-NEXT:    store i8 0, i8* [[P10]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP]], i8* [[P]], i32 10, i1 false)
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[INALLOCA_SAVE]])
+; CHECK-NEXT:    [[HEAP:%.*]] = call i8* @malloc(i32 9)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[HEAP]], i8* [[TMP]], i32 9, i1 false)
+; CHECK-NEXT:    call void @useit(i8* [[HEAP]])
+; CHECK-NEXT:    ret i32 0
+;
   %tmpmem = alloca [10 x i8], align 4
   %tmp = getelementptr inbounds [10 x i8], [10 x i8]* %tmpmem, i32 0, i32 0
   %inalloca.save = tail call i8* @llvm.stacksave()
@@ -61,11 +86,6 @@ define i32 @test_stackrestore() {
   ret i32 0
 }
 
-; CHECK-LABEL: define i32 @test_stackrestore()
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* align 1 getelementptr inbounds ([9 x i8], [9 x i8]* @str, i32 0, i32 0), i32 9, i1 false)
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* %p, i32 10, i1 false)
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %heap, i8* %tmp, i32 9, i1 false)
-
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1)
 declare i8* @llvm.stacksave()
 declare void @llvm.stackrestore(i8*)

diff  --git a/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll b/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll
index f75b63edef35..81d3da0966d9 100644
--- a/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll
+++ b/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll
@@ -65,7 +65,7 @@ define void @vector_fixed_length_nonzero(<16 x i8>* %p) {
 
 define void @vector_scalable_nonzero(<vscale x 4 x i32>* %p) {
 ; CHECK-LABEL: @vector_scalable_nonzero(
-; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
   store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p

diff  --git a/llvm/test/Transforms/MemCpyOpt/store-to-memset.ll b/llvm/test/Transforms/MemCpyOpt/store-to-memset.ll
index 59ed892b60ee..51651e73e2bc 100644
--- a/llvm/test/Transforms/MemCpyOpt/store-to-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/store-to-memset.ll
@@ -1,8 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -memcpyopt -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-grtev4-linux-gnu"
 
 define i8* @foo(i8* returned %0, i32 %1, i64 %2) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP0:%.*]], i64 [[TMP2:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i64 -32
+; CHECK-NEXT:    [[VV:%.*]] = trunc i32 [[TMP1:%.*]] to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 3
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 4
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 5
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 6
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 7
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 8
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 9
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 10
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 11
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 12
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 13
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 14
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 15
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i64 16
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 1
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 2
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 3
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 4
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 5
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 6
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 7
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 8
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 9
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 10
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 11
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 12
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 13
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 14
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i64 15
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP4]], i8 [[VV]], i64 32, i1 false)
+; CHECK-NEXT:    ret i8* [[TMP0]]
+;
 entry:
   %3 = getelementptr inbounds i8, i8* %0, i64 %2
   %4 = getelementptr inbounds i8, i8* %3, i64 -32
@@ -71,7 +111,5 @@ entry:
   %35 = getelementptr inbounds i8, i8* %20, i64 15
   store i8 %vv, i8* %35, align 1
   ret i8* %0
-; CHECK-LABEL: @foo
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %4, i8 %vv, i64 32, i1 false)
 }
 

diff  --git a/llvm/test/Transforms/MemCpyOpt/vscale-memset.ll b/llvm/test/Transforms/MemCpyOpt/vscale-memset.ll
index 256bd8518dc1..952a57796f87 100644
--- a/llvm/test/Transforms/MemCpyOpt/vscale-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/vscale-memset.ll
@@ -9,9 +9,9 @@ define void @foo(i8* %p) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:    [[A:%.*]] = bitcast i8* [[P:%.*]] to <vscale x 16 x i8>*
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* [[A]], i64 0
-; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8>* [[TMP0]]
+; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8>* [[TMP0]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* [[A]], i64 1
-; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8>* [[TMP1]]
+; CHECK-NEXT:    store <vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8>* [[TMP1]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %a = bitcast i8* %p to <vscale x 16 x i8>*


        


More information about the llvm-commits mailing list