[llvm] r358552 - Revert "Temporarily Revert "Add basic loop fusion pass.""

Tue Apr 16 21:53:01 PDT 2019

Added: llvm/trunk/test/Transforms/JumpThreading/threading_prof1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/JumpThreading/threading_prof1.ll?rev=358552&view=auto
==============================================================================

--- llvm/trunk/test/Transforms/JumpThreading/threading_prof1.ll (added)
+++ llvm/trunk/test/Transforms/JumpThreading/threading_prof1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,99 @@
+; RUN: opt -jump-threading -S < %s | FileCheck %s
+; RUN: opt -passes=jump-threading -S < %s | FileCheck %s
+
+define void @test() { ; %bb5 branches on a phi of its two incoming edges, so each predecessor's outcome can be resolved separately
+; CHECK-LABEL: @test()
+bb:
+  %tmp = call i32 @a()
+  %tmp1 = icmp eq i32 %tmp, 1
+  br i1 %tmp1, label %bb5, label %bb2
+; CHECK: br i1 %tmp1,{{.*}} !prof ![[PROF1:[0-9]+]]
+
+bb2:                                              ; preds = %bb
+  %tmp3 = call i32 @b()
+  %tmp4 = icmp ne i32 %tmp3, 1
+  br label %bb5 ; the directive below expects a conditional branch on %tmp4 with !prof metadata in the output
+; CHECK: br i1 %tmp4, {{.*}} !prof ![[PROF2:[0-9]+]]
+
+bb5:                                              ; preds = %bb2, %bb
+  %tmp6 = phi i1 [ false, %bb ], [ %tmp4, %bb2 ] ; constant false when entered from %bb, so that edge always continues to %bb7
+  br i1 %tmp6, label %bb8, label %bb7, !prof !0
+
+bb7:                                              ; preds = %bb5
+  call void @bar()
+  br label %bb8
+
+bb8:                                              ; preds = %bb7, %bb5
+  ret void
+}
+
+define void @test_single_pred1() { ; same shape as @test, but the constant-false edge reaches %bb5 through the forwarding block %bb5_1
+; CHECK-LABEL: @test_single_pred1()
+bb:
+  %tmp = call i32 @a()
+  %tmp1 = icmp eq i32 %tmp, 1
+  br i1 %tmp1, label %bb5_1, label %bb2
+; CHECK: br i1 %tmp1,{{.*}} !prof ![[PROF1:[0-9]+]]
+
+bb5_1:                                             ; preds = %bb
+  br label %bb5;
+
+bb2:                                              ; preds = %bb
+  %tmp3 = call i32 @b()
+  %tmp4 = icmp ne i32 %tmp3, 1
+  br label %bb5
+; CHECK: br i1 %tmp4, {{.*}} !prof ![[PROF2:[0-9]+]]
+
+bb5:                                             ; preds = %bb2, %bb5_1
+  %tmp6 = phi i1 [ false, %bb5_1 ], [ %tmp4, %bb2 ]
+  br i1 %tmp6, label %bb8, label %bb7, !prof !0
+
+bb7:                                            ; preds = %bb5
+  call void @bar()
+  br label %bb8
+
+bb8:                                           ; preds = %bb7, %bb5
+  ret void
+}
+
+define void @test_single_pred2() { ; same shape as @test, but the constant-false edge passes through two forwarding blocks (%bb5_1, %bb5_2)
+; CHECK-LABEL: @test_single_pred2()
+bb:
+  %tmp = call i32 @a()
+  %tmp1 = icmp eq i32 %tmp, 1
+  br i1 %tmp1, label %bb5_1, label %bb2
+; CHECK: br i1 %tmp1,{{.*}} !prof ![[PROF1:[0-9]+]]
+
+bb5_1:                                             ; preds = %bb
+  br label %bb5_2;
+
+bb5_2:                                             ; preds = %bb5_1
+  br label %bb5;
+
+bb2:                          ; preds = %bb
+  %tmp3 = call i32 @b()
+  %tmp4 = icmp ne i32 %tmp3, 1
+  br label %bb5
+; CHECK: br i1 %tmp4, {{.*}} !prof ![[PROF2:[0-9]+]]
+
+bb5:                         ; preds = %bb2, %bb5_2
+  %tmp6 = phi i1 [ false, %bb5_2 ], [ %tmp4, %bb2 ]
+  br i1 %tmp6, label %bb8, label %bb7, !prof !0
+
+bb7:                        ; preds = %bb5
+  call void @bar()
+  br label %bb8
+
+bb8:                       ; preds = %bb7, %bb5
+  ret void
+}
+
+declare void @bar()
+
+declare i32 @a()
+
+declare i32 @b()
+
+!0 = !{!"branch_weights", i32 2146410443, i32 1073205}
+;CHECK: ![[PROF1]] = !{!"branch_weights", i32 1073205, i32 2146410443}
+;CHECK: ![[PROF2]] = !{!"branch_weights", i32 2146410443, i32 1073205}

Added: llvm/trunk/test/Transforms/JumpThreading/threading_prof2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/JumpThreading/threading_prof2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/JumpThreading/threading_prof2.ll (added)
+++ llvm/trunk/test/Transforms/JumpThreading/threading_prof2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,42 @@
+; RUN: opt -jump-threading -S < %s | FileCheck %s
+; RUN: opt -passes=jump-threading -S < %s | FileCheck %s
+define void @test() { ; variant where both successors of %bb5 do work (@foo / @bar) before joining in %bb9
+bb:
+  %tmp = call i32 @a()
+  %tmp1 = icmp eq i32 %tmp, 1
+  br i1 %tmp1, label %bb5, label %bb2
+; CHECK: br i1 %tmp1,{{.*}} !prof ![[PROF1:[0-9]+]]
+
+bb2:                                              ; preds = %bb
+  %tmp3 = call i32 @b()
+  %tmp4 = icmp ne i32 %tmp3, 1
+  br label %bb5
+; CHECK: br i1 %tmp4, {{.*}} !prof ![[PROF2:[0-9]+]]
+
+bb5:                                             ; preds = %bb2, %bb
+  %tmp6 = phi i1 [ false, %bb ], [ %tmp4, %bb2 ] ; constant false when entered from %bb, so that edge always continues to %bb7
+  br i1 %tmp6, label %bb8, label %bb7, !prof !0
+
+bb7:                                            ; preds = %bb5
+  call void @bar()
+  br label %bb9
+
+bb8: ; preds = %bb5
+  call void @foo()
+  br label %bb9
+
+bb9:                                           ; preds = %bb8, %bb7
+  ret void
+}
+
+declare void @bar()
+
+declare void @foo()
+
+declare i32 @a()
+
+declare i32 @b()
+
+!0 = !{!"branch_weights", i32 2146410443, i32 1073205}
+;CHECK: ![[PROF1]] = !{!"branch_weights", i32 1073205, i32 2146410443}
+;CHECK: ![[PROF2]] = !{!"branch_weights", i32 2146410443, i32 1073205}

Added: llvm/trunk/test/Transforms/JumpThreading/update-edge-weight.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/JumpThreading/update-edge-weight.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/JumpThreading/update-edge-weight.ll (added)
+++ llvm/trunk/test/Transforms/JumpThreading/update-edge-weight.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; RUN: opt -S -jump-threading %s | FileCheck %s
+
+; Test if edge weights are properly updated after jump threading.
+
+; CHECK: !2 = !{!"branch_weights", i32 1629125526, i32 518358122}
+
+define void @foo(i32 %n) !prof !0 { ; two back-to-back diamonds on %n; the second condition is implied on the %if.then.1 path
+entry:
+  %cmp = icmp sgt i32 %n, 10
+  br i1 %cmp, label %if.then.1, label %if.else.1, !prof !1
+
+if.then.1: ; reached only when %n > 10
+  tail call void @a()
+  br label %if.cond
+
+if.else.1: ; reached only when %n <= 10
+  tail call void @b()
+  br label %if.cond
+
+if.cond: ; join of both arms of the first diamond
+  %cmp1 = icmp sgt i32 %n, 5 ; always true when arriving from %if.then.1, since %n > 10 there
+  br i1 %cmp1, label %if.then.2, label %if.else.2, !prof !2
+
+if.then.2:
+  tail call void @c()
+  br label %if.end
+
+if.else.2:
+  tail call void @d()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @a()
+declare void @b()
+declare void @c()
+declare void @d()
+
+!0 = !{!"function_entry_count", i64 1}
+!1 = !{!"branch_weights", i32 10, i32 5}
+!2 = !{!"branch_weights", i32 10, i32 1}

Added: llvm/trunk/test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/2006-06-03-IncorrectIDFPhis.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt < %s -loop-simplify -lcssa -S | FileCheck %s
+
+        %struct.SetJmpMapEntry = type { i8*, i32, %struct.SetJmpMapEntry* }
+
+define void @__llvm_sjljeh_try_catching_longjmp_exception() { ; loop %no_exit <-> %endif; %SJE.0.0 is defined in the loop and used in the exit block %then
+; CHECK-LABEL: @__llvm_sjljeh_try_catching_longjmp_exception
+entry:
+        br i1 false, label %UnifiedReturnBlock, label %no_exit
+no_exit:                ; preds = %endif, %entry
+        %SJE.0.0 = phi %struct.SetJmpMapEntry* [ %tmp.24, %endif ], [ null, %entry ]            ; <%struct.SetJmpMapEntry*> [#uses=1]
+        br i1 false, label %then, label %endif
+then:           ; preds = %no_exit; returns, so it is outside the loop
+; CHECK: %SJE.0.0.lcssa = phi %struct.SetJmpMapEntry
+        %tmp.20 = getelementptr %struct.SetJmpMapEntry, %struct.SetJmpMapEntry* %SJE.0.0, i32 0, i32 1          ; <i32*> [#uses=0]
+        ret void
+endif:          ; preds = %no_exit
+        %tmp.24 = load %struct.SetJmpMapEntry*, %struct.SetJmpMapEntry** null            ; <%struct.SetJmpMapEntry*> [#uses=1]
+        br i1 false, label %UnifiedReturnBlock, label %no_exit
+UnifiedReturnBlock:             ; preds = %endif, %entry
+        ret void
+}
+

Added: llvm/trunk/test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/2006-06-12-MultipleExitsSameBlock.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt < %s -lcssa -S | FileCheck %s
+; RUN: opt < %s -passes=lcssa -S | FileCheck %s
+
+declare i1 @c1()
+
+declare i1 @c2()
+
+define i32 @foo() { ; loop with two exiting edges into the same exit block %loop_exit, where the loop-defined %X.1 is used
+; CHECK-LABEL: @foo
+entry:
+	br label %loop_begin
+loop_begin:		; preds = %loop_body.2, %entry
+	br i1 true, label %loop_body.1, label %loop_exit2
+loop_body.1:		; preds = %loop_begin
+	%X.1 = add i32 0, 1		; <i32> [#uses=1]
+	%rel.1 = call i1 @c1( )		; <i1> [#uses=1]
+	br i1 %rel.1, label %loop_exit, label %loop_body.2
+loop_body.2:		; preds = %loop_body.1
+	%rel.2 = call i1 @c2( )		; <i1> [#uses=1]
+	br i1 %rel.2, label %loop_exit, label %loop_begin
+loop_exit:		; preds = %loop_body.2, %loop_body.1
+; CHECK: %X.1.lcssa = phi
+	ret i32 %X.1
+loop_exit2:		; preds = %loop_begin
+	ret i32 1
+; CHECK-NOT: %X.1.lcssa1
+}
+

Added: llvm/trunk/test/Transforms/LCSSA/2006-07-09-NoDominator.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/2006-07-09-NoDominator.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/2006-07-09-NoDominator.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/2006-07-09-NoDominator.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt < %s -lcssa
+; RUN: opt < %s -passes=lcssa
+
+	%struct.SetJmpMapEntry = type { i8*, i32, %struct.SetJmpMapEntry* }
+
+define void @__llvm_sjljeh_try_catching_longjmp_exception() { ; loop-header phi %SJE.0 is used in %then, which leaves the loop; no output is verified for this test
+entry:
+	br label %loopentry
+loopentry:		; preds = %endif, %entry
+	%SJE.0 = phi %struct.SetJmpMapEntry* [ null, %entry ], [ %tmp.25, %endif ]	; <%struct.SetJmpMapEntry*> [#uses=1]
+	br i1 false, label %no_exit, label %loopexit
+no_exit:		; preds = %loopentry
+	br i1 false, label %then, label %endif
+then:		; preds = %no_exit
+	%tmp.21 = getelementptr %struct.SetJmpMapEntry, %struct.SetJmpMapEntry* %SJE.0, i32 0, i32 1		; <i32*> [#uses=0]
+	br label %return
+endif:		; preds = %no_exit
+	%tmp.25 = load %struct.SetJmpMapEntry*, %struct.SetJmpMapEntry** null		; <%struct.SetJmpMapEntry*> [#uses=1]
+	br label %loopentry
+loopexit:		; preds = %loopentry
+	br label %return
+return:		; preds = %loopexit, %then
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/LCSSA/2006-10-31-UnreachableBlock-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/2006-10-31-UnreachableBlock-2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/2006-10-31-UnreachableBlock-2.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/2006-10-31-UnreachableBlock-2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,145 @@
+; RUN: opt < %s -lcssa -disable-output -verify-dom-info -verify-loop-info
+; PR977
+; END.
+declare i32 @opost_block()
+
+define void @write_chan() { ; CFG with many predecessor-less blocks; %tmp.94 is defined inside the %loopentry.1 loop and consumed by the phi in %break_out
+entry:
+	br i1 false, label %shortcirc_next.0, label %shortcirc_done.0
+shortcirc_next.0:		; preds = %entry
+	br label %shortcirc_done.0
+shortcirc_done.0:		; preds = %shortcirc_next.0, %entry
+	br i1 false, label %shortcirc_next.1, label %shortcirc_done.1
+shortcirc_next.1:		; preds = %shortcirc_done.0
+	br label %shortcirc_done.1
+shortcirc_done.1:		; preds = %shortcirc_next.1, %shortcirc_done.0
+	br i1 false, label %then.0, label %endif.0
+then.0:		; preds = %shortcirc_done.1
+	br i1 false, label %then.1, label %endif.1
+then.1:		; preds = %then.0
+	br label %return
+after_ret.0:		; No predecessors!
+	br label %endif.1
+endif.1:		; preds = %after_ret.0, %then.0
+	br label %endif.0
+endif.0:		; preds = %endif.1, %shortcirc_done.1
+	br label %loopentry.0
+loopentry.0:		; preds = %endif.12, %endif.0
+	br i1 false, label %then.2, label %endif.2
+then.2:		; preds = %loopentry.0
+	br label %loopexit.0
+dead_block_after_break.0:		; No predecessors!
+	br label %endif.2
+endif.2:		; preds = %dead_block_after_break.0, %loopentry.0
+	br i1 false, label %shortcirc_done.2, label %shortcirc_next.2
+shortcirc_next.2:		; preds = %endif.2
+	br i1 false, label %shortcirc_next.3, label %shortcirc_done.3
+shortcirc_next.3:		; preds = %shortcirc_next.2
+	br label %shortcirc_done.3
+shortcirc_done.3:		; preds = %shortcirc_next.3, %shortcirc_next.2
+	br label %shortcirc_done.2
+shortcirc_done.2:		; preds = %shortcirc_done.3, %endif.2
+	br i1 false, label %then.3, label %endif.3
+then.3:		; preds = %shortcirc_done.2
+	br label %loopexit.0
+dead_block_after_break.1:		; No predecessors!
+	br label %endif.3
+endif.3:		; preds = %dead_block_after_break.1, %shortcirc_done.2
+	br i1 false, label %shortcirc_next.4, label %shortcirc_done.4
+shortcirc_next.4:		; preds = %endif.3
+	br label %shortcirc_done.4
+shortcirc_done.4:		; preds = %shortcirc_next.4, %endif.3
+	br i1 false, label %then.4, label %else
+then.4:		; preds = %shortcirc_done.4
+	br label %loopentry.1
+loopentry.1:		; preds = %endif.8, %then.4
+	br i1 false, label %no_exit, label %loopexit.1
+no_exit:		; preds = %loopentry.1
+	%tmp.94 = call i32 @opost_block( )		; <i32> [#uses=1]; its only user is the phi in %break_out
+	br i1 false, label %then.5, label %endif.5
+then.5:		; preds = %no_exit
+	br i1 false, label %then.6, label %endif.6
+then.6:		; preds = %then.5
+	br label %loopexit.1
+dead_block_after_break.2:		; No predecessors!
+	br label %endif.6
+endif.6:		; preds = %dead_block_after_break.2, %then.5
+	br label %break_out
+dead_block_after_goto.0:		; No predecessors!
+	br label %endif.5
+endif.5:		; preds = %dead_block_after_goto.0, %no_exit
+	br i1 false, label %then.7, label %endif.7
+then.7:		; preds = %endif.5
+	br label %loopexit.1
+dead_block_after_break.3:		; No predecessors!
+	br label %endif.7
+endif.7:		; preds = %dead_block_after_break.3, %endif.5
+	switch i32 1, label %switchexit [
+		 i32 4, label %label.2
+		 i32 2, label %label.1
+		 i32 1, label %label.0
+	]
+label.0:		; preds = %endif.7
+	br label %switchexit
+dead_block_after_break.4:		; No predecessors!
+	br label %label.1
+label.1:		; preds = %dead_block_after_break.4, %endif.7
+	br label %switchexit
+dead_block_after_break.5:		; No predecessors!
+	br label %label.2
+label.2:		; preds = %dead_block_after_break.5, %endif.7
+	br label %switchexit
+dead_block_after_break.6:		; No predecessors!
+	br label %switchexit
+switchexit:		; preds = %dead_block_after_break.6, %label.2, %label.1, %label.0, %endif.7
+	br i1 false, label %then.8, label %endif.8
+then.8:		; preds = %switchexit
+	br label %loopexit.1
+dead_block_after_break.7:		; No predecessors!
+	br label %endif.8
+endif.8:		; preds = %dead_block_after_break.7, %switchexit
+	br label %loopentry.1
+loopexit.1:		; preds = %then.8, %then.7, %then.6, %loopentry.1
+	br i1 false, label %then.9, label %endif.9
+then.9:		; preds = %loopexit.1
+	br label %endif.9
+endif.9:		; preds = %then.9, %loopexit.1
+	br label %endif.4
+else:		; preds = %shortcirc_done.4
+	br i1 false, label %then.10, label %endif.10
+then.10:		; preds = %else
+	br label %break_out
+dead_block_after_goto.1:		; No predecessors!
+	br label %endif.10
+endif.10:		; preds = %dead_block_after_goto.1, %else
+	br label %endif.4
+endif.4:		; preds = %endif.10, %endif.9
+	br i1 false, label %then.11, label %endif.11
+then.11:		; preds = %endif.4
+	br label %loopexit.0
+dead_block_after_break.8:		; No predecessors!
+	br label %endif.11
+endif.11:		; preds = %dead_block_after_break.8, %endif.4
+	br i1 false, label %then.12, label %endif.12
+then.12:		; preds = %endif.11
+	br label %loopexit.0
+dead_block_after_break.9:		; No predecessors!
+	br label %endif.12
+endif.12:		; preds = %dead_block_after_break.9, %endif.11
+	br label %loopentry.0
+loopexit.0:		; preds = %then.12, %then.11, %then.3, %then.2
+	br label %break_out
+break_out:		; preds = %loopexit.0, %then.10, %endif.6
+	%retval.3 = phi i32 [ 0, %loopexit.0 ], [ %tmp.94, %endif.6 ], [ 0, %then.10 ]		; <i32> [#uses=0]
+	br i1 false, label %cond_true, label %cond_false
+cond_true:		; preds = %break_out
+	br label %cond_continue
+cond_false:		; preds = %break_out
+	br label %cond_continue
+cond_continue:		; preds = %cond_false, %cond_true
+	br label %return
+after_ret.1:		; No predecessors!
+	br label %return
+return:		; preds = %after_ret.1, %cond_continue, %then.1
+	ret void
+}

Added: llvm/trunk/test/Transforms/LCSSA/2006-10-31-UnreachableBlock.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/2006-10-31-UnreachableBlock.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/2006-10-31-UnreachableBlock.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/2006-10-31-UnreachableBlock.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,185 @@
+; RUN: opt < %s -lcssa -disable-output
+; RUN: opt < %s -passes=lcssa -disable-output
+; PR977
+; END.
+
+define void @process_backlog() { ; %inc (defined in %dev_put69.exit) reaches the %loopentry.preheader phi only via %dead_block_after_break, which has no predecessors
+entry:
+	br label %loopentry.preheader
+loopentry.preheader:		; preds = %dead_block_after_break, %entry
+	%work.0.ph = phi i32 [ %inc, %dead_block_after_break ], [ 0, %entry ]		; <i32> [#uses=0]
+	br label %loopentry
+loopentry:		; preds = %endif.1, %loopentry.preheader
+	br i1 false, label %then.i, label %loopentry.__skb_dequeue67.exit_crit_edge
+loopentry.__skb_dequeue67.exit_crit_edge:		; preds = %loopentry
+	br label %__skb_dequeue67.exit
+then.i:		; preds = %loopentry
+	br label %__skb_dequeue67.exit
+__skb_dequeue67.exit:		; preds = %then.i, %loopentry.__skb_dequeue67.exit_crit_edge
+	br i1 false, label %then.0, label %__skb_dequeue67.exit.endif.0_crit_edge
+__skb_dequeue67.exit.endif.0_crit_edge:		; preds = %__skb_dequeue67.exit
+	br label %endif.0
+then.0:		; preds = %__skb_dequeue67.exit
+	br label %job_done
+dead_block_after_goto:		; No predecessors!
+	unreachable
+endif.0:		; preds = %__skb_dequeue67.exit.endif.0_crit_edge
+	br i1 false, label %then.0.i, label %endif.0.endif.0.i_crit_edge
+endif.0.endif.0.i_crit_edge:		; preds = %endif.0
+	br label %endif.0.i
+then.0.i:		; preds = %endif.0
+	br label %endif.0.i
+endif.0.i:		; preds = %then.0.i, %endif.0.endif.0.i_crit_edge
+	br i1 false, label %then.i.i, label %endif.0.i.skb_bond.exit.i_crit_edge
+endif.0.i.skb_bond.exit.i_crit_edge:		; preds = %endif.0.i
+	br label %skb_bond.exit.i
+then.i.i:		; preds = %endif.0.i
+	br label %skb_bond.exit.i
+skb_bond.exit.i:		; preds = %then.i.i, %endif.0.i.skb_bond.exit.i_crit_edge
+	br label %loopentry.0.i
+loopentry.0.i:		; preds = %loopentry.0.i.backedge, %skb_bond.exit.i
+	br i1 false, label %loopentry.0.i.no_exit.0.i_crit_edge, label %loopentry.0.i.loopexit.0.i_crit_edge
+loopentry.0.i.loopexit.0.i_crit_edge:		; preds = %loopentry.0.i
+	br label %loopexit.0.i
+loopentry.0.i.no_exit.0.i_crit_edge:		; preds = %loopentry.0.i
+	br label %no_exit.0.i
+no_exit.0.i:		; preds = %then.3.i.no_exit.0.i_crit_edge, %loopentry.0.i.no_exit.0.i_crit_edge
+	br i1 false, label %no_exit.0.i.shortcirc_done.0.i_crit_edge, label %shortcirc_next.0.i
+no_exit.0.i.shortcirc_done.0.i_crit_edge:		; preds = %no_exit.0.i
+	br label %shortcirc_done.0.i
+shortcirc_next.0.i:		; preds = %no_exit.0.i
+	br label %shortcirc_done.0.i
+shortcirc_done.0.i:		; preds = %shortcirc_next.0.i, %no_exit.0.i.shortcirc_done.0.i_crit_edge
+	br i1 false, label %then.1.i, label %endif.1.i
+then.1.i:		; preds = %shortcirc_done.0.i
+	br i1 false, label %then.2.i, label %then.1.i.endif.2.i_crit_edge
+then.1.i.endif.2.i_crit_edge:		; preds = %then.1.i
+	br label %endif.2.i
+then.2.i:		; preds = %then.1.i
+	br i1 false, label %then.3.i, label %else.0.i
+then.3.i:		; preds = %then.2.i
+	br i1 false, label %then.3.i.no_exit.0.i_crit_edge, label %then.3.i.loopexit.0.i_crit_edge
+then.3.i.loopexit.0.i_crit_edge:		; preds = %then.3.i
+	br label %loopexit.0.i
+then.3.i.no_exit.0.i_crit_edge:		; preds = %then.3.i
+	br label %no_exit.0.i
+else.0.i:		; preds = %then.2.i
+	br label %endif.2.i
+endif.3.i:		; No predecessors!
+	unreachable
+endif.2.i:		; preds = %else.0.i, %then.1.i.endif.2.i_crit_edge
+	br label %loopentry.0.i.backedge
+endif.1.i:		; preds = %shortcirc_done.0.i
+	br label %loopentry.0.i.backedge
+loopentry.0.i.backedge:		; preds = %endif.1.i, %endif.2.i
+	br label %loopentry.0.i
+loopexit.0.i:		; preds = %then.3.i.loopexit.0.i_crit_edge, %loopentry.0.i.loopexit.0.i_crit_edge
+	br label %loopentry.1.i
+loopentry.1.i:		; preds = %loopentry.1.i.backedge, %loopexit.0.i
+	br i1 false, label %loopentry.1.i.no_exit.1.i_crit_edge, label %loopentry.1.i.loopexit.1.i_crit_edge
+loopentry.1.i.loopexit.1.i_crit_edge:		; preds = %loopentry.1.i
+	br label %loopexit.1.i
+loopentry.1.i.no_exit.1.i_crit_edge:		; preds = %loopentry.1.i
+	br label %no_exit.1.i
+no_exit.1.i:		; preds = %then.6.i.no_exit.1.i_crit_edge, %loopentry.1.i.no_exit.1.i_crit_edge
+	br i1 false, label %shortcirc_next.1.i, label %no_exit.1.i.shortcirc_done.1.i_crit_edge
+no_exit.1.i.shortcirc_done.1.i_crit_edge:		; preds = %no_exit.1.i
+	br label %shortcirc_done.1.i
+shortcirc_next.1.i:		; preds = %no_exit.1.i
+	br i1 false, label %shortcirc_next.1.i.shortcirc_done.2.i_crit_edge, label %shortcirc_next.2.i
+shortcirc_next.1.i.shortcirc_done.2.i_crit_edge:		; preds = %shortcirc_next.1.i
+	br label %shortcirc_done.2.i
+shortcirc_next.2.i:		; preds = %shortcirc_next.1.i
+	br label %shortcirc_done.2.i
+shortcirc_done.2.i:		; preds = %shortcirc_next.2.i, %shortcirc_next.1.i.shortcirc_done.2.i_crit_edge
+	br label %shortcirc_done.1.i
+shortcirc_done.1.i:		; preds = %shortcirc_done.2.i, %no_exit.1.i.shortcirc_done.1.i_crit_edge
+	br i1 false, label %then.4.i, label %endif.4.i
+then.4.i:		; preds = %shortcirc_done.1.i
+	br i1 false, label %then.5.i, label %then.4.i.endif.5.i_crit_edge
+then.4.i.endif.5.i_crit_edge:		; preds = %then.4.i
+	br label %endif.5.i
+then.5.i:		; preds = %then.4.i
+	br i1 false, label %then.6.i, label %else.1.i
+then.6.i:		; preds = %then.5.i
+	br i1 false, label %then.6.i.no_exit.1.i_crit_edge, label %then.6.i.loopexit.1.i_crit_edge
+then.6.i.loopexit.1.i_crit_edge:		; preds = %then.6.i
+	br label %loopexit.1.i
+then.6.i.no_exit.1.i_crit_edge:		; preds = %then.6.i
+	br label %no_exit.1.i
+else.1.i:		; preds = %then.5.i
+	br label %endif.5.i
+endif.6.i:		; No predecessors!
+	unreachable
+endif.5.i:		; preds = %else.1.i, %then.4.i.endif.5.i_crit_edge
+	br label %loopentry.1.i.backedge
+endif.4.i:		; preds = %shortcirc_done.1.i
+	br label %loopentry.1.i.backedge
+loopentry.1.i.backedge:		; preds = %endif.4.i, %endif.5.i
+	br label %loopentry.1.i
+loopexit.1.i:		; preds = %then.6.i.loopexit.1.i_crit_edge, %loopentry.1.i.loopexit.1.i_crit_edge
+	br i1 false, label %then.7.i, label %else.2.i
+then.7.i:		; preds = %loopexit.1.i
+	br i1 false, label %then.8.i, label %else.3.i
+then.8.i:		; preds = %then.7.i
+	br label %netif_receive_skb.exit
+else.3.i:		; preds = %then.7.i
+	br label %netif_receive_skb.exit
+endif.8.i:		; No predecessors!
+	unreachable
+else.2.i:		; preds = %loopexit.1.i
+	br i1 false, label %else.2.i.shortcirc_done.i.i_crit_edge, label %shortcirc_next.i.i
+else.2.i.shortcirc_done.i.i_crit_edge:		; preds = %else.2.i
+	br label %shortcirc_done.i.i
+shortcirc_next.i.i:		; preds = %else.2.i
+	br label %shortcirc_done.i.i
+shortcirc_done.i.i:		; preds = %shortcirc_next.i.i, %else.2.i.shortcirc_done.i.i_crit_edge
+	br i1 false, label %then.i1.i, label %shortcirc_done.i.i.kfree_skb65.exit.i_crit_edge
+shortcirc_done.i.i.kfree_skb65.exit.i_crit_edge:		; preds = %shortcirc_done.i.i
+	br label %kfree_skb65.exit.i
+then.i1.i:		; preds = %shortcirc_done.i.i
+	br label %kfree_skb65.exit.i
+kfree_skb65.exit.i:		; preds = %then.i1.i, %shortcirc_done.i.i.kfree_skb65.exit.i_crit_edge
+	br label %netif_receive_skb.exit
+netif_receive_skb.exit:		; preds = %kfree_skb65.exit.i, %else.3.i, %then.8.i
+	br i1 false, label %then.i1, label %netif_receive_skb.exit.dev_put69.exit_crit_edge
+netif_receive_skb.exit.dev_put69.exit_crit_edge:		; preds = %netif_receive_skb.exit
+	br label %dev_put69.exit
+then.i1:		; preds = %netif_receive_skb.exit
+	br label %dev_put69.exit
+dev_put69.exit:		; preds = %then.i1, %netif_receive_skb.exit.dev_put69.exit_crit_edge
+	%inc = add i32 0, 1		; <i32> [#uses=1]; its only user is the phi in %loopentry.preheader
+	br i1 false, label %dev_put69.exit.shortcirc_done_crit_edge, label %shortcirc_next
+dev_put69.exit.shortcirc_done_crit_edge:		; preds = %dev_put69.exit
+	br label %shortcirc_done
+shortcirc_next:		; preds = %dev_put69.exit
+	br label %shortcirc_done
+shortcirc_done:		; preds = %shortcirc_next, %dev_put69.exit.shortcirc_done_crit_edge
+	br i1 false, label %then.1, label %endif.1
+then.1:		; preds = %shortcirc_done
+	ret void
+dead_block_after_break:		; No predecessors!
+	br label %loopentry.preheader
+endif.1:		; preds = %shortcirc_done
+	br label %loopentry
+loopexit:		; No predecessors!
+	unreachable
+after_ret.0:		; No predecessors!
+	br label %job_done
+job_done:		; preds = %after_ret.0, %then.0
+	br label %loopentry.i
+loopentry.i:		; preds = %no_exit.i, %job_done
+	br i1 false, label %no_exit.i, label %clear_bit62.exit
+no_exit.i:		; preds = %loopentry.i
+	br label %loopentry.i
+clear_bit62.exit:		; preds = %loopentry.i
+	br i1 false, label %then.2, label %endif.2
+then.2:		; preds = %clear_bit62.exit
+	ret void
+endif.2:		; preds = %clear_bit62.exit
+	ret void
+after_ret.1:		; No predecessors!
+	ret void
+return:		; No predecessors!
+	unreachable
+}

Added: llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM-2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM-2.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM-2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; RUN: opt < %s -loop-rotate -licm -loop-unswitch -disable-output
+define i32 @main(i32 %argc, i8** %argv) { ; single self-looping block %bb7 with a chain of vector fadds between a load from and a store to null
+entry:
+	br label %bb7
+
+bb7:		; preds = %bb7, %entry
+	%tmp39 = load <4 x float>, <4 x float>* null		; <<4 x float>> [#uses=1]
+	%tmp40 = fadd <4 x float> %tmp39, < float 2.000000e+00, float 3.000000e+00, float 1.000000e+00, float 0.000000e+00 >		; <<4 x float>> [#uses=1]
+	%tmp43 = fadd <4 x float> %tmp40, < float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 2.000000e+00 >		; <<4 x float>> [#uses=1]
+	%tmp46 = fadd <4 x float> %tmp43, < float 3.000000e+00, float 0.000000e+00, float 2.000000e+00, float 4.000000e+00 >		; <<4 x float>> [#uses=1]
+	%tmp49 = fadd <4 x float> %tmp46, < float 0.000000e+00, float 4.000000e+00, float 6.000000e+00, float 1.000000e+00 >		; <<4 x float>> [#uses=1]
+	store <4 x float> %tmp49, <4 x float>* null
+	br i1 false, label %bb7, label %bb56 ; constant-false condition: the back edge to %bb7 is never taken
+
+bb56:		; preds = %bb7
+	ret i32 0
+}

Added: llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM-3.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM-3.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM-3.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt < %s -loop-rotate -licm -loop-unswitch -disable-output
+
+define i32 @main(i32 %argc, i8** %argv) { ; nested loops: outer %bb -> %bb7 -> %bb56 -> %bb, inner self-loop on %bb7 doing the vector load/fadd/store
+entry:
+        br label %bb
+
+bb:             ; preds = %bb56, %entry
+        br label %bb7
+
+bb7:            ; preds = %bb7, %bb
+        %tmp39 = load <4 x float>, <4 x float>* null         ; <<4 x float>> [#uses=1]
+        %tmp40 = fadd <4 x float> %tmp39, < float 2.000000e+00, float 3.000000e+00, float 1.000000e+00, float 0.000000e+00 >             ; <<4 x float>> [#uses=1]
+        %tmp43 = fadd <4 x float> %tmp40, < float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 2.000000e+00 >             ; <<4 x float>> [#uses=1]
+        %tmp46 = fadd <4 x float> %tmp43, < float 3.000000e+00, float 0.000000e+00, float 2.000000e+00, float 4.000000e+00 >             ; <<4 x float>> [#uses=1]
+        %tmp49 = fadd <4 x float> %tmp46, < float 0.000000e+00, float 4.000000e+00, float 6.000000e+00, float 1.000000e+00 >             ; <<4 x float>> [#uses=1]
+        store <4 x float> %tmp49, <4 x float>* null
+        br i1 false, label %bb7, label %bb56 ; constant-false condition: the inner back edge is never taken
+
+bb56:           ; preds = %bb7
+        br i1 false, label %bb, label %bb64 ; constant-false condition: the outer back edge is never taken
+
+bb64:           ; preds = %bb56
+        ret i32 0
+}

Added: llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/2007-07-12-LICM.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt < %s -loop-rotate -licm -loop-unswitch -disable-output
+define i32 @main(i32 %argc, i8** %argv) { ; reduced variant: the loop stores zeroinitializer, and the loaded/added value %tmp40 has no users
+entry:
+	br label %bb7
+
+bb7:		; preds = %bb7, %entry
+	%tmp39 = load <4 x float>, <4 x float>* null		; <<4 x float>> [#uses=1]
+	%tmp40 = fadd <4 x float> %tmp39, < float 2.000000e+00, float 3.000000e+00, float 1.000000e+00, float 0.000000e+00 >		; <<4 x float>> [#uses=0]
+	store <4 x float> zeroinitializer, <4 x float>* null
+	br i1 false, label %bb7, label %bb56 ; constant-false condition: the back edge to %bb7 is never taken
+
+bb56:		; preds = %bb7
+	ret i32 0
+}

Added: llvm/trunk/test/Transforms/LCSSA/avoid-intrinsics-in-catchswitch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/avoid-intrinsics-in-catchswitch.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/avoid-intrinsics-in-catchswitch.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/avoid-intrinsics-in-catchswitch.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,133 @@
+; RUN: opt < %s -debugify -licm -S -o /dev/null
+;
+; The following test is from https://bugs.llvm.org/show_bug.cgi?id=36238
+; This test should pass (not assert or fault). The error that originally
+; provoked this test was regarding the LCSSA pass trying to insert a dbg.value
+; intrinsic into a catchswitch block.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.11.0"
+
+%struct.e = type { i32 }
+%struct.d = type { i8 }
+%class.f = type { %class.b }
+%class.b = type { i8 }
+%struct.k = type opaque
+
+@"\01?l@@3HA" = local_unnamed_addr global i32 0, align 4
+
+define i32 @"\01?m@@YAJXZ"() personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+  %n = alloca %struct.e, align 4
+  %db = alloca i32, align 4
+  %o = alloca %struct.d, align 1
+  %q = alloca i8*, align 8
+  %r = alloca i32, align 4
+  %u = alloca i64, align 8
+  %s = alloca %class.f, align 1
+  %offset = alloca i64, align 8
+  %t = alloca i64, align 8
+  %status = alloca i32, align 4
+  call void (...) @llvm.localescape(%class.f* nonnull %s, i32* nonnull %status)
+  %0 = bitcast %struct.e* %n to i8*
+  %1 = bitcast i32* %db to i8*
+  %2 = getelementptr inbounds %struct.d, %struct.d* %o, i64 0, i32 0
+  %3 = bitcast i8** %q to i8*
+  %4 = bitcast i32* %r to i8*
+  %5 = bitcast i64* %u to i8*
+  %6 = getelementptr inbounds %class.f, %class.f* %s, i64 0, i32 0, i32 0
+  %7 = load i32, i32* @"\01?l@@3HA", align 4, !tbaa !3
+  %call = call %class.f* @"\01??0f@@QEAA@H@Z"(%class.f* nonnull %s, i32 %7)
+  %8 = bitcast i64* %offset to i8*
+  %9 = bitcast i64* %t to i8*
+  %10 = bitcast i32* %status to i8*
+  %11 = bitcast %class.f* %s to %struct.d*
+  %c = getelementptr inbounds %struct.e, %struct.e* %n, i64 0, i32 0
+  br label %for.cond
+
+for.cond:                                         ; preds = %cleanup.cont, %entry
+  %p.0 = phi i32 [ undef, %entry ], [ %call2, %cleanup.cont ]
+  invoke void @"\01?h@@YAXPEAH0HPEAIPEAPEAEPEA_K33PEAUd@@4@Z"(i32* nonnull %db, i32* nonnull %c, i32 undef, i32* nonnull %r, i8** nonnull %q, i64* nonnull %u, i64* nonnull %offset, i64* nonnull %t, %struct.d* nonnull %11, %struct.d* nonnull %o)
+          to label %__try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %for.cond
+  %12 = catchswitch within none [label %__except.ret] unwind label %ehcleanup  ; must remain the only non-PHI instruction in this block
+
+__except.ret:                                     ; preds = %catch.dispatch
+  %13 = catchpad within %12 [i8* bitcast (i32 (i8*, i8*)* @"\01?filt$0@0@m@@" to i8*)]
+  catchret from %13 to label %cleanup7
+
+__try.cont:                                       ; preds = %for.cond
+  %tobool = icmp eq i32 %p.0, 0
+  br i1 %tobool, label %if.end, label %cleanup7
+
+if.end:                                           ; preds = %__try.cont
+  %call2 = invoke i32 @"\01?a@@YAJXZ"()
+          to label %cleanup.cont unwind label %ehcleanup
+
+cleanup.cont:                                     ; preds = %if.end
+  br label %for.cond
+
+ehcleanup:                                        ; preds = %if.end, %catch.dispatch
+  %14 = cleanuppad within none []
+  %g.i = getelementptr inbounds %class.f, %class.f* %s, i64 0, i32 0
+  call void @"\01??1b@@QEAA@XZ"(%class.b* nonnull %g.i) [ "funclet"(token %14) ]
+  cleanupret from %14 unwind to caller
+
+cleanup7:                                         ; preds = %__try.cont, %__except.ret
+  %p.2.ph = phi i32 [ 7, %__except.ret ], [ %p.0, %__try.cont ]
+  %g.i32 = getelementptr inbounds %class.f, %class.f* %s, i64 0, i32 0
+  call void @"\01??1b@@QEAA@XZ"(%class.b* nonnull %g.i32)
+  ret i32 %p.2.ph
+}
+
+declare %class.f* @"\01??0f@@QEAA@H@Z"(%class.f* returned, i32) unnamed_addr
+
+define internal i32 @"\01?filt$0@0@m@@"(i8* %exception_pointers, i8* %frame_pointer) personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+  %0 = tail call i8* @llvm.eh.recoverfp(i8* bitcast (i32 ()* @"\01?m@@YAJXZ" to i8*), i8* %frame_pointer)
+  %1 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @"\01?m@@YAJXZ" to i8*), i8* %0, i32 0)
+  %2 = tail call i8* @llvm.localrecover(i8* bitcast (i32 ()* @"\01?m@@YAJXZ" to i8*), i8* %0, i32 1)
+  %status = bitcast i8* %2 to i32*
+  %agg.tmp = alloca %class.f, align 1
+  %3 = bitcast i8* %exception_pointers to i32**
+  %4 = load i32*, i32** %3, align 8
+  %5 = load i32, i32* %4, align 4
+  %6 = bitcast i8* %exception_pointers to %struct.k*
+  %7 = getelementptr inbounds %class.f, %class.f* %agg.tmp, i64 0, i32 0, i32 0
+  %8 = load i8, i8* %1, align 1
+  store i8 %8, i8* %7, align 1
+  %call = invoke i32 @"\01?j@@YAJVf@@JPEAUk@@PEAH@Z"(i8 %8, i32 %5, %struct.k* %6, i32* %status)
+          to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:                                      ; preds = %entry
+  %g.i = getelementptr inbounds %class.f, %class.f* %agg.tmp, i64 0, i32 0
+  call void @"\01??1b@@QEAA@XZ"(%class.b* nonnull %g.i)
+  ret i32 %call
+
+ehcleanup:                                        ; preds = %entry
+  %9 = cleanuppad within none []
+  %g.i2 = getelementptr inbounds %class.f, %class.f* %agg.tmp, i64 0, i32 0
+  call void @"\01??1b@@QEAA@XZ"(%class.b* nonnull %g.i2) [ "funclet"(token %9) ]
+  cleanupret from %9 unwind to caller
+}
+
+declare i8* @llvm.eh.recoverfp(i8*, i8*)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+declare i32 @"\01?j@@YAJVf@@JPEAUk@@PEAH@Z"(i8, i32, %struct.k*, i32*) local_unnamed_addr
+declare i32 @__C_specific_handler(...)
+declare void @"\01?h@@YAXPEAH0HPEAIPEAPEAEPEA_K33PEAUd@@4@Z"(i32*, i32*, i32, i32*, i8**, i64*, i64*, i64*, %struct.d*, %struct.d*) local_unnamed_addr
+declare i32 @"\01?a@@YAJXZ"() local_unnamed_addr
+declare void @llvm.localescape(...)
+declare void @"\01??1b@@QEAA@XZ"(%class.b*) unnamed_addr
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}

Added: llvm/trunk/test/Transforms/LCSSA/basictest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/basictest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/basictest.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/basictest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s -lcssa -S | FileCheck %s
+; RUN: opt < %s -passes=lcssa -S | FileCheck %s
+; RUN: opt < %s -debugify -lcssa -S | FileCheck -check-prefix=DEBUGIFY %s
+
+define void @lcssa(i1 %S2) {
+; CHECK-LABEL: @lcssa
+entry:
+	br label %loop.interior
+loop.interior:		; preds = %post.if, %entry
+	br i1 %S2, label %if.true, label %if.false
+if.true:		; preds = %loop.interior
+	%X1 = add i32 0, 0		; <i32> [#uses=1]
+	br label %post.if
+if.false:		; preds = %loop.interior
+	%X2 = add i32 0, 1		; <i32> [#uses=1]
+	br label %post.if
+post.if:		; preds = %if.false, %if.true
+	%X3 = phi i32 [ %X1, %if.true ], [ %X2, %if.false ]		; <i32> [#uses=1]
+	br i1 %S2, label %loop.exit, label %loop.interior
+loop.exit:		; preds = %post.if
+; CHECK: %X3.lcssa = phi i32
+; DEBUGIFY: %X3.lcssa = phi i32 {{.*}}, !dbg ![[DbgLoc:[0-9]+]]
+; DEBUGIFY-NEXT: call void @llvm.dbg.value(metadata i32 %X3.lcssa
+; CHECK: %X4 = add i32 3, %X3.lcssa
+	%X4 = add i32 3, %X3		; <i32> [#uses=0]
+	ret void
+}
+
+; Make sure the lcssa phi has %X3's debug location
+; DEBUGIFY: ![[DbgLoc]] = !DILocation(line: 7

Added: llvm/trunk/test/Transforms/LCSSA/indirectbr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/indirectbr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/indirectbr.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/indirectbr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,574 @@
+; RUN: opt < %s -loop-simplify -lcssa -verify-loop-info -verify-dom-info -S | FileCheck %s
+
+; LCSSA should work correctly in the case of an indirectbr that exits
+; the loop, and the loop has exits with predecessors not within the loop
+; (and btw these edges are unsplittable due to the indirectbr).
+; PR5437
+define i32 @test0() nounwind {
+; CHECK-LABEL: @test0
+entry:
+  br i1 undef, label %"4", label %"3"
+
+"3":                                              ; preds = %entry
+  ret i32 0
+
+"4":                                              ; preds = %entry
+  br i1 undef, label %"6", label %"5"
+
+"5":                                              ; preds = %"4"
+  unreachable
+
+"6":                                              ; preds = %"4"
+  br i1 undef, label %"10", label %"13"
+
+"10":                                             ; preds = %"6"
+  br i1 undef, label %"22", label %"15"
+
+"13":                                             ; preds = %"6"
+  unreachable
+
+"15":                                             ; preds = %"23", %"10"
+  unreachable
+
+"22":                                             ; preds = %"10"
+  br label %"23"
+
+"23":                                             ; preds = %"1375", %"22"
+  %0 = phi i32 [ undef, %"22" ], [ %1, %"1375" ]  ; <i32> [#uses=1]
+  indirectbr i8* undef, [label %"15", label %"24", label %"25", label %"26", label %"27", label %"28", label %"29", label %"30", label %"32", label %"32", label %"33", label %"167", label %"173", label %"173", label %"173", label %"173", label %"173", label %"192", label %"193", label %"194", label %"196", label %"206", label %"231", label %"241", label %"251", label %"261", label %"307", label %"353", label %"354", label %"355", label %"361", label %"367", label %"400", label %"433", label %"466", label %"499", label %"509", label %"519", label %"529", label %"571", label %"589", label %"607", label %"635", label %"655", label %"664", label %"671", label %"680", label %"687", label %"692", label %"698", label %"704", label %"715", label %"715", label %"716", label %"725", label %"725", label %"725", label %"725", label %"724", label %"724", label %"724", label %"724", label %"737", label %"737", label %"737", label %"737", label %"761", label %"758", label %"759", label %"760", label %"766", label %"763", label %"764", label %"765", label %"771", label %"768", label %"769", label %"770", label %"780", label %"777", label %"778", label %"779", label %"821", label %"826", label %"831", label %"832", label %"833", label %"836", label %"836", label %"886", label %"905", label %"978", label %"978", label %"1136", label %"1166", label %"1179", label %"1201", label %"1212", label %"1212", label %"1274", label %"1284", label %"1284", label %"1346", label %"1347", label %"1348", label %"1349", label %"1350", label %"1353", label %"1353", label %"1353", label %"1355", label %"1355", label %"1357", label %"1357", label %"1358", label %"1359", label %"1374", label %"1375", label %"1376", label %"1377", label %"1378", label %"1379", label %"1386", label %"1395", label %"1394", label %"1425", label %"1426", label %"1440", label %"1449", label %"1455", label %"1461", label %"1471", label %"1482", label %"1484", label %"1486", label %"1489", label %"1489", label %"1492", label 
%"1494", label %"1494", label %"1497", label %"1499", label %"1499", label %"1515", label %"1546", label %"1546", label %"1566", label %"1584", label %"1587", label %"1591", label %"1605", label %"1609", label %"1609", label %"1640", label %"1648", label %"1651", label %"1703", label %"1710", label %"1718", label %"1724", label %"1725", label %"1726", label %"1727", label %"1728", label %"1731", label %"1732", label %"1733", label %"1734", label %"1735", label %"1741", label %"1750", label %"1752", label %"1754", label %"1755", label %"1757", label %"1759", label %"1761", label %"1764", label %"1764", label %"1766", label %"1768", label %"1775", label %"1775", label %"1781", label %"1781", label %"1790", label %"1827", label %"1836", label %"1836", label %"1845", label %"1845", label %"1848", label %"1849", label %"1851", label %"1853", label %"1856", label %"1861", label %"1861"]
+
+"24":                                             ; preds = %"23"
+  unreachable
+
+"25":                                             ; preds = %"23"
+  unreachable
+
+"26":                                             ; preds = %"23"
+  unreachable
+
+"27":                                             ; preds = %"23"
+  unreachable
+
+"28":                                             ; preds = %"23"
+  unreachable
+
+"29":                                             ; preds = %"23"
+  unreachable
+
+"30":                                             ; preds = %"23"
+  unreachable
+
+"32":                                             ; preds = %"23", %"23"
+  unreachable
+
+"33":                                             ; preds = %"23"
+  unreachable
+
+"167":                                            ; preds = %"23"
+  unreachable
+
+"173":                                            ; preds = %"23", %"23", %"23", %"23", %"23"
+  unreachable
+
+"192":                                            ; preds = %"23"
+  unreachable
+
+"193":                                            ; preds = %"23"
+  unreachable
+
+"194":                                            ; preds = %"23"
+  unreachable
+
+"196":                                            ; preds = %"23"
+  unreachable
+
+"206":                                            ; preds = %"23"
+  unreachable
+
+"231":                                            ; preds = %"23"
+  unreachable
+
+"241":                                            ; preds = %"23"
+  unreachable
+
+"251":                                            ; preds = %"23"
+  unreachable
+
+"261":                                            ; preds = %"23"
+  unreachable
+
+"307":                                            ; preds = %"23"
+  unreachable
+
+"353":                                            ; preds = %"23"
+  unreachable
+
+"354":                                            ; preds = %"23"
+  unreachable
+
+"355":                                            ; preds = %"23"
+  unreachable
+
+"361":                                            ; preds = %"23"
+  unreachable
+
+"367":                                            ; preds = %"23"
+  unreachable
+
+"400":                                            ; preds = %"23"
+  unreachable
+
+"433":                                            ; preds = %"23"
+  unreachable
+
+"466":                                            ; preds = %"23"
+  unreachable
+
+"499":                                            ; preds = %"23"
+  unreachable
+
+"509":                                            ; preds = %"23"
+  unreachable
+
+"519":                                            ; preds = %"23"
+  unreachable
+
+"529":                                            ; preds = %"23"
+  unreachable
+
+"571":                                            ; preds = %"23"
+  unreachable
+
+"589":                                            ; preds = %"23"
+  unreachable
+
+"607":                                            ; preds = %"23"
+  unreachable
+
+"635":                                            ; preds = %"23"
+  unreachable
+
+"655":                                            ; preds = %"23"
+  unreachable
+
+"664":                                            ; preds = %"23"
+  unreachable
+
+"671":                                            ; preds = %"23"
+  unreachable
+
+"680":                                            ; preds = %"23"
+  unreachable
+
+"687":                                            ; preds = %"23"
+  unreachable
+
+"692":                                            ; preds = %"23"
+  br label %"1862"
+
+"698":                                            ; preds = %"23"
+  unreachable
+
+"704":                                            ; preds = %"23"
+  unreachable
+
+"715":                                            ; preds = %"23", %"23"
+  unreachable
+
+"716":                                            ; preds = %"23"
+  unreachable
+
+"724":                                            ; preds = %"23", %"23", %"23", %"23"
+  unreachable
+
+"725":                                            ; preds = %"23", %"23", %"23", %"23"
+  unreachable
+
+"737":                                            ; preds = %"23", %"23", %"23", %"23"
+  unreachable
+
+"758":                                            ; preds = %"23"
+  unreachable
+
+"759":                                            ; preds = %"23"
+  unreachable
+
+"760":                                            ; preds = %"23"
+  unreachable
+
+"761":                                            ; preds = %"23"
+  unreachable
+
+"763":                                            ; preds = %"23"
+  unreachable
+
+"764":                                            ; preds = %"23"
+  unreachable
+
+"765":                                            ; preds = %"23"
+  br label %"766"
+
+"766":                                            ; preds = %"765", %"23"
+  unreachable
+
+"768":                                            ; preds = %"23"
+  unreachable
+
+"769":                                            ; preds = %"23"
+  unreachable
+
+"770":                                            ; preds = %"23"
+  unreachable
+
+"771":                                            ; preds = %"23"
+  unreachable
+
+"777":                                            ; preds = %"23"
+  unreachable
+
+"778":                                            ; preds = %"23"
+  unreachable
+
+"779":                                            ; preds = %"23"
+  unreachable
+
+"780":                                            ; preds = %"23"
+  unreachable
+
+"821":                                            ; preds = %"23"
+  unreachable
+
+"826":                                            ; preds = %"23"
+  unreachable
+
+"831":                                            ; preds = %"23"
+  unreachable
+
+"832":                                            ; preds = %"23"
+  unreachable
+
+"833":                                            ; preds = %"23"
+  unreachable
+
+"836":                                            ; preds = %"23", %"23"
+  unreachable
+
+"886":                                            ; preds = %"23"
+  unreachable
+
+"905":                                            ; preds = %"23"
+  unreachable
+
+"978":                                            ; preds = %"23", %"23"
+  unreachable
+
+"1136":                                           ; preds = %"23"
+  unreachable
+
+"1166":                                           ; preds = %"23"
+  unreachable
+
+"1179":                                           ; preds = %"23"
+  unreachable
+
+"1201":                                           ; preds = %"23"
+  unreachable
+
+"1212":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1274":                                           ; preds = %"23"
+  unreachable
+
+"1284":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1346":                                           ; preds = %"23"
+  unreachable
+
+"1347":                                           ; preds = %"23"
+  unreachable
+
+"1348":                                           ; preds = %"23"
+  unreachable
+
+"1349":                                           ; preds = %"23"
+  unreachable
+
+"1350":                                           ; preds = %"23"
+  unreachable
+
+"1353":                                           ; preds = %"23", %"23", %"23"
+  unreachable
+
+"1355":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1357":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1358":                                           ; preds = %"23"
+  unreachable
+
+"1359":                                           ; preds = %"23"
+  unreachable
+
+"1374":                                           ; preds = %"23"
+  unreachable
+
+"1375":                                           ; preds = %"23"
+  %1 = zext i8 undef to i32                       ; <i32> [#uses=1]
+  br label %"23"
+
+"1376":                                           ; preds = %"23"
+  unreachable
+
+"1377":                                           ; preds = %"23"
+  unreachable
+
+"1378":                                           ; preds = %"23"
+  unreachable
+
+"1379":                                           ; preds = %"23"
+  unreachable
+
+"1386":                                           ; preds = %"23"
+  unreachable
+
+"1394":                                           ; preds = %"23"
+  unreachable
+
+"1395":                                           ; preds = %"23"
+  unreachable
+
+"1425":                                           ; preds = %"23"
+  unreachable
+
+"1426":                                           ; preds = %"23"
+  unreachable
+
+"1440":                                           ; preds = %"23"
+  unreachable
+
+"1449":                                           ; preds = %"23"
+  unreachable
+
+"1455":                                           ; preds = %"23"
+  unreachable
+
+"1461":                                           ; preds = %"23"
+  unreachable
+
+"1471":                                           ; preds = %"23"
+  unreachable
+
+"1482":                                           ; preds = %"23"
+  unreachable
+
+"1484":                                           ; preds = %"23"
+  unreachable
+
+"1486":                                           ; preds = %"23"
+  unreachable
+
+"1489":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1492":                                           ; preds = %"23"
+  unreachable
+
+"1494":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1497":                                           ; preds = %"23"
+  unreachable
+
+"1499":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1515":                                           ; preds = %"23"
+  unreachable
+
+"1546":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1566":                                           ; preds = %"23"
+  br i1 undef, label %"1569", label %"1568"
+
+"1568":                                           ; preds = %"1566"
+  unreachable
+
+"1569":                                           ; preds = %"1566"
+  unreachable
+
+"1584":                                           ; preds = %"23"
+  unreachable
+
+"1587":                                           ; preds = %"23"
+  unreachable
+
+"1591":                                           ; preds = %"23"
+  unreachable
+
+"1605":                                           ; preds = %"23"
+  unreachable
+
+"1609":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1640":                                           ; preds = %"23"
+  unreachable
+
+"1648":                                           ; preds = %"23"
+  unreachable
+
+"1651":                                           ; preds = %"23"
+  unreachable
+
+"1703":                                           ; preds = %"23"
+  unreachable
+
+"1710":                                           ; preds = %"23"
+  unreachable
+
+"1718":                                           ; preds = %"23"
+  unreachable
+
+"1724":                                           ; preds = %"23"
+  unreachable
+
+"1725":                                           ; preds = %"23"
+  unreachable
+
+"1726":                                           ; preds = %"23"
+  unreachable
+
+"1727":                                           ; preds = %"23"
+  unreachable
+
+"1728":                                           ; preds = %"23"
+  unreachable
+
+"1731":                                           ; preds = %"23"
+  unreachable
+
+"1732":                                           ; preds = %"23"
+  unreachable
+
+"1733":                                           ; preds = %"23"
+  unreachable
+
+"1734":                                           ; preds = %"23"
+  unreachable
+
+"1735":                                           ; preds = %"23"
+  unreachable
+
+"1741":                                           ; preds = %"23"
+  unreachable
+
+"1750":                                           ; preds = %"23"
+  unreachable
+
+"1752":                                           ; preds = %"23"
+  unreachable
+
+"1754":                                           ; preds = %"23"
+  unreachable
+
+"1755":                                           ; preds = %"23"
+  unreachable
+
+"1757":                                           ; preds = %"23"
+  unreachable
+
+"1759":                                           ; preds = %"23"
+  unreachable
+
+"1761":                                           ; preds = %"23"
+  unreachable
+
+"1764":                                           ; preds = %"23", %"23"
+  %2 = icmp eq i32 %0, 168                        ; <i1> [#uses=0]
+  unreachable
+
+"1766":                                           ; preds = %"23"
+  unreachable
+
+"1768":                                           ; preds = %"23"
+  unreachable
+
+"1775":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1781":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1790":                                           ; preds = %"23"
+  unreachable
+
+"1827":                                           ; preds = %"23"
+  unreachable
+
+"1836":                                           ; preds = %"23", %"23"
+  br label %"1862"
+
+"1845":                                           ; preds = %"23", %"23"
+  unreachable
+
+"1848":                                           ; preds = %"23"
+  unreachable
+
+"1849":                                           ; preds = %"23"
+  unreachable
+
+"1851":                                           ; preds = %"23"
+  unreachable
+
+"1853":                                           ; preds = %"23"
+  unreachable
+
+"1856":                                           ; preds = %"23"
+  unreachable
+
+"1861":                                           ; preds = %"23", %"23"
+  unreachable
+
+"41":                                             ; preds = %"23", %"23"
+  unreachable
+
+"1862":                                           ; preds = %"1836", %"692"
+  unreachable
+}
+
+; An exit for Loop L1 may be the header of a disjoint Loop L2.  Thus, when we
+; create PHIs in one of such exits we are also inserting PHIs in L2 header. This
+; could break LCSSA form for L2 because these inserted PHIs can also have uses
+; in L2 exits. Test that we don't assert/crash on that.
+define void @test1() {
+; CHECK-LABEL: @test1
+  br label %lab1
+
+lab1:                                             ; first loop: %lab1 -> %lab2 -> %lab1
+  %tmp21 = add i32 undef, 677038203
+  br i1 undef, label %lab2, label %exit
+
+lab2:
+  indirectbr i8* undef, [label %lab1, label %lab3]  ; targets: back to %lab1, or out to %lab3
+
+lab3:                                             ; second loop: %lab3 -> %lab4 -> %lab3
+; CHECK: %tmp21.lcssa1 = phi i32 [ %tmp21.lcssa1, %lab4 ], [ %tmp21, %lab2 ]
+  %tmp12 = phi i32 [ %tmp21, %lab2 ], [ %tmp12, %lab4 ]
+  br i1 undef, label %lab5, label %lab4
+
+lab4:
+  br label %lab3
+
+lab5:                                             ; reached only from %lab3
+; CHECK:  %tmp21.lcssa1.lcssa = phi i32 [ %tmp21.lcssa1, %lab3 ]
+  %tmp15 = add i32 %tmp12, undef
+  br label %exit
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LCSSA/invoke-dest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/invoke-dest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/invoke-dest.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/invoke-dest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,152 @@
+; RUN: opt < %s -lcssa
+; RUN: opt < %s -passes=lcssa
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+
+@.str12 = external constant [3 x i8], align 1		; <[3 x i8]*> [#uses=1]
+@.str17175 = external constant [4 x i8], align 1		; <[4 x i8]*> [#uses=1]
+@.str21179 = external constant [12 x i8], align 1		; <[12 x i8]*> [#uses=1]
+@.str25183 = external constant [10 x i8], align 1		; <[10 x i8]*> [#uses=1]
+@.str32190 = external constant [92 x i8], align 1		; <[92 x i8]*> [#uses=1]
+@.str41 = external constant [25 x i8], align 1		; <[25 x i8]*> [#uses=1]
+
+define void @_ZN8EtherBus10initializeEv() personality i32 (...)* @__gxx_personality_v0 {
+entry:
+	br i1 undef, label %_ZN7cObjectnwEj.exit, label %bb.i
+
+bb.i:		; preds = %entry
+	br label %_ZN7cObjectnwEj.exit
+
+_ZN7cObjectnwEj.exit:		; preds = %bb.i, %entry
+	invoke void @_ZN7cObjectC2EPKc(i8* undef, i8* getelementptr ([12 x i8], [12 x i8]* @.str21179, i32 0, i32 0))
+			to label %bb1 unwind label %lpad
+
+bb1:		; preds = %_ZN7cObjectnwEj.exit
+	br i1 undef, label %_ZNK5cGate4sizeEv.exit, label %bb.i110
+
+bb.i110:		; preds = %bb1
+	br label %_ZNK5cGate4sizeEv.exit
+
+_ZNK5cGate4sizeEv.exit:		; preds = %bb.i110, %bb1
+	br i1 undef, label %_ZNK5cGate4sizeEv.exit122, label %bb.i120
+
+bb.i120:		; preds = %_ZNK5cGate4sizeEv.exit
+	br label %_ZNK5cGate4sizeEv.exit122
+
+_ZNK5cGate4sizeEv.exit122:		; preds = %bb.i120, %_ZNK5cGate4sizeEv.exit
+	br i1 undef, label %bb8, label %bb2
+
+bb2:		; preds = %_ZNK5cGate4sizeEv.exit122
+	unreachable
+
+bb8:		; preds = %_ZNK5cGate4sizeEv.exit122
+	%tmp = invoke i8* @_ZN7cModule3parEPKc(i8* undef, i8* getelementptr ([10 x i8], [10 x i8]* @.str25183, i32 0, i32 0))
+			to label %invcont9 unwind label %lpad119		; <i8*> [#uses=1]
+
+invcont9:		; preds = %bb8
+	%tmp1 = invoke i8* @_ZN4cPar11stringValueEv(i8* %tmp)
+			to label %invcont10 unwind label %lpad119		; <i8*> [#uses=1]
+
+invcont10:		; preds = %invcont9
+	invoke void @_ZN8EtherBus8tokenizeEPKcRSt6vectorIdSaIdEE(i8* null, i8* %tmp1, i8* undef)
+			to label %invcont11 unwind label %lpad119
+
+invcont11:		; preds = %invcont10
+	br i1 undef, label %bb12, label %bb18
+
+bb12:		; preds = %invcont11
+	invoke void (i8*, i8*, ...) @_ZN6cEnvir6printfEPKcz(i8* null, i8* getelementptr ([3 x i8], [3 x i8]* @.str12, i32 0, i32 0), i32 undef)
+			to label %bb.i.i159 unwind label %lpad119
+
+bb.i.i159:		; preds = %bb12
+	unreachable
+
+bb18:		; preds = %invcont11
+	br i1 undef, label %bb32, label %bb34
+
+bb32:		; preds = %bb18
+	br i1 undef, label %bb.i.i123, label %bb34
+
+bb.i.i123:		; preds = %bb32
+	br label %bb34
+
+bb34:		; preds = %bb.i.i123, %bb32, %bb18
+	%tmp2 = invoke i8* @_Znaj(i32 undef)
+			to label %invcont35 unwind label %lpad119		; <i8*> [#uses=0]
+
+invcont35:		; preds = %bb34
+	br i1 undef, label %bb49, label %bb61
+
+bb49:		; preds = %invcont35
+	invoke void (i8*, i8*, ...) @_ZNK13cSimpleModule5errorEPKcz(i8* undef, i8* getelementptr ([92 x i8], [92 x i8]* @.str32190, i32 0, i32 0))
+			to label %bb51 unwind label %lpad119
+
+bb51:		; preds = %bb49
+	unreachable
+
+bb61:		; preds = %invcont35
+	br label %bb106
+
+.noexc:		; preds = %bb106
+	invoke void @_ZN7cObjectC2EPKc(i8* undef, i8* getelementptr ([25 x i8], [25 x i8]* @.str41, i32 0, i32 0))
+			to label %bb102 unwind label %lpad123
+
+bb102:		; preds = %.noexc
+	invoke void undef(i8* undef, i8 zeroext 1)
+			to label %invcont103 unwind label %lpad119
+
+invcont103:		; preds = %bb102
+	invoke void undef(i8* undef, double 1.000000e+07)
+			to label %invcont104 unwind label %lpad119
+
+invcont104:		; preds = %invcont103
+	%tmp3 = invoke i32 @_ZN13cSimpleModule11sendDelayedEP8cMessagedPKci(i8* undef, i8* undef, double 0.000000e+00, i8* getelementptr ([4 x i8], [4 x i8]* @.str17175, i32 0, i32 0), i32 undef)
+			to label %invcont105 unwind label %lpad119		; <i32> [#uses=0]
+
+invcont105:		; preds = %invcont104
+	br label %bb106
+
+bb106:		; preds = %invcont105, %bb61
+	%tmp4 = invoke i8* @_Znaj(i32 124)
+			to label %.noexc unwind label %lpad119		; <i8*> [#uses=1]
+
+lpad:		; preds = %_ZN7cObjectnwEj.exit
+        %exn = landingpad {i8*, i32}
+                 cleanup
+	br label %Unwind
+
+lpad119:		; preds = %bb106, %invcont104, %invcont103, %bb102, %bb49, %bb34, %bb12, %invcont10, %invcont9, %bb8
+        %exn119 = landingpad {i8*, i32}
+                 cleanup
+	unreachable
+
+lpad123:		; preds = %.noexc
+        %exn123 = landingpad {i8*, i32}
+                 cleanup
+	%tmp5 = icmp eq i8* %tmp4, null		; <i1> [#uses=1]
+	br i1 %tmp5, label %Unwind, label %bb.i2
+
+bb.i2:		; preds = %lpad123
+	br label %Unwind
+
+Unwind:		; preds = %bb.i2, %lpad123, %lpad
+	unreachable
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZN8EtherBus8tokenizeEPKcRSt6vectorIdSaIdEE(i8* nocapture, i8*, i8*)
+
+declare i8* @_Znaj(i32)
+
+declare void @_ZN6cEnvir6printfEPKcz(i8* nocapture, i8* nocapture, ...)
+
+declare void @_ZNK13cSimpleModule5errorEPKcz(i8* nocapture, i8* nocapture, ...) noreturn
+
+declare i8* @_ZN7cModule3parEPKc(i8*, i8*)
+
+declare i32 @_ZN13cSimpleModule11sendDelayedEP8cMessagedPKci(i8*, i8*, double, i8*, i32)
+
+declare void @_ZN7cObjectC2EPKc(i8*, i8*)
+
+declare i8* @_ZN4cPar11stringValueEv(i8*)

Added: llvm/trunk/test/Transforms/LCSSA/mixed-catch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/mixed-catch.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/mixed-catch.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/mixed-catch.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,96 @@
+; RUN: opt -lcssa -S < %s | FileCheck %s
+; RUN: opt -passes=lcssa -S < %s | FileCheck %s
+
+; This test is based on the following C++ code:
+;
+; void f()
+; {
+;   for (int i=0; i<12; i++) {
+;     try {
+;       if (i==3)
+;         throw i;
+;     } catch (int) {
+;       continue;
+;     } catch (...) { }
+;     if (i==3) break;
+;   }
+; }
+;
+; The loop info analysis identifies the catch pad for the second catch as being
+; outside the loop (because it returns to %for.end) but the associated
+; catchswitch block is identified as being inside the loop.  Because of this
+; analysis, the LCSSA pass wants to create a PHI node in the catchpad block
+; for the catchswitch value, but this is a token, so it can't.
+
+define void @f() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %tmp = alloca i32, align 4
+  %i7 = alloca i32, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, 12
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %cond = icmp eq i32 %i.0, 3
+  br i1 %cond, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  store i32 %i.0, i32* %tmp, align 4
+  %tmp1 = bitcast i32* %tmp to i8*
+  invoke void @_CxxThrowException(i8* %tmp1, %eh.ThrowInfo* nonnull @_TI1H) #1
+          to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %if.then
+  %tmp2 = catchswitch within none [label %catch, label %catch2] unwind to caller   ; token-typed value shared by both catchpads
+
+catch:                                            ; preds = %catch.dispatch
+  %tmp3 = catchpad within %tmp2 [%rtti.TypeDescriptor2* @"\01??_R0H@8", i32 0, i32* %i7]
+  catchret from %tmp3 to label %for.inc           ; stays in the loop: resumes at %for.inc
+
+catch2:                                           ; preds = %catch.dispatch
+  %tmp4 = catchpad within %tmp2 [i8* null, i32 64, i8* null]
+  catchret from %tmp4 to label %for.end           ; leaves the loop: resumes at %for.end
+
+for.inc:                                          ; preds = %catch, %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %catch2, %for.cond
+  ret void
+
+unreachable:                                      ; preds = %if.then
+  unreachable
+}
+
+; CHECK-LABEL: define void @f()
+; CHECK: catch2:
+; CHECK-NOT: phi
+; CHECK:   %tmp4 = catchpad within %tmp2
+; CHECK:   catchret from %tmp4 to label %for.end
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%eh.CatchableType = type { i32, i32, i32, i32, i32, i32, i32 }
+%eh.CatchableTypeArray.1 = type { i32, [1 x i32] }
+%eh.ThrowInfo = type { i32, i32, i32, i32 }
+
+$"\01??_R0H@8" = comdat any
+
+$"_CT??_R0H@84" = comdat any
+
+$_CTA1H = comdat any
+
+$_TI1H = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+@__ImageBase = external constant i8
+@"_CT??_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32), i32 0, i32 -1, i32 0, i32 4, i32 0 }, section ".xdata", comdat
+@_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x i32] [i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableType* @"_CT??_R0H@84" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32)] }, section ".xdata", comdat
+@_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i32 0, i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableTypeArray.1* @_CTA1H to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section ".xdata", comdat
+
+declare void @_CxxThrowException(i8*, %eh.ThrowInfo*)
+
+declare i32 @__CxxFrameHandler3(...)

Added: llvm/trunk/test/Transforms/LCSSA/pr28424.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/pr28424.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/pr28424.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/pr28424.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,87 @@
+; RUN: opt < %s -lcssa -S -o - | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR28424
+; Here LCSSA adds phi-nodes for %x into the loop exits. Then, SSAUpdater needs
+; to insert phi-nodes to merge these values. That creates a new def, which in
+; its turn needs another LCSSA phi-node, and this test ensures that we insert
+; it.
+
+; CHECK-LABEL: @foo1
+define internal i32 @foo1() {   ; loop is header -> if -> latch -> header, with one exit edge from each of the three blocks
+entry:
+  br label %header
+
+header:
+  %x = add i32 0, 1   ; defined inside the loop; its only use in the input is the ret in %exit
+  br i1 undef, label %if, label %loopexit1
+
+if:
+  br i1 undef, label %latch, label %loopexit2
+
+latch:
+  br i1 undef, label %header, label %loopexit3
+
+; CHECK: loopexit1:
+; CHECK:   %x.lcssa = phi i32 [ %x, %header ]
+loopexit1:
+  br label %loop_with_insert_point
+
+; CHECK: loopexit2:
+; CHECK:   %x.lcssa1 = phi i32 [ %x, %if ]
+loopexit2:
+  br label %exit
+
+; CHECK: loopexit3:
+; CHECK:   %x.lcssa2 = phi i32 [ %x, %latch ]
+loopexit3:
+  br label %loop_with_insert_point
+
+; CHECK: loop_with_insert_point:
+; CHECK:   %x4 = phi i32 [ %x4, %loop_with_insert_point ], [ %x.lcssa2, %loopexit3 ], [ %x.lcssa, %loopexit1 ]
+loop_with_insert_point:   ; second, self-looping block reached from %loopexit1 and %loopexit3
+  br i1 undef, label %loop_with_insert_point, label %bb
+
+; CHECK: bb:
+; CHECK:   %x4.lcssa = phi i32 [ %x4, %loop_with_insert_point ]
+bb:
+  br label %exit
+
+; CHECK: exit:
+; CHECK:   %x3 = phi i32 [ %x4.lcssa, %bb ], [ %x.lcssa1, %loopexit2 ]
+exit:   ; reached from %bb and from %loopexit2
+  ret i32 %x
+}
+
+; CHECK-LABEL: @foo2
+define internal i32 @foo2() {   ; loop is header -> latch -> header, with one exit edge from each block
+entry:
+  br label %header
+
+header:
+  %x = add i32 0, 1   ; defined inside the loop; its only use in the input is the ret in %exit
+  br i1 undef, label %latch, label %loopexit1
+
+latch:
+  br i1 undef, label %header, label %loopexit2
+
+; CHECK: loopexit1:
+; CHECK:   %x.lcssa = phi i32 [ %x, %header ]
+loopexit1:
+  br label %loop_with_insert_point
+
+; CHECK: loopexit2:
+; CHECK:   %x.lcssa1 = phi i32 [ %x, %latch ]
+loopexit2:
+  br label %loop_with_insert_point
+
+; CHECK: loop_with_insert_point:
+; CHECK:   %x2 = phi i32 [ %x2, %loop_with_insert_point ], [ %x.lcssa1, %loopexit2 ], [ %x.lcssa, %loopexit1 ]
+loop_with_insert_point:   ; second, self-looping block where both exit paths of the first loop meet
+  br i1 undef, label %loop_with_insert_point, label %exit
+
+; CHECK: exit:
+; CHECK:   %x2.lcssa = phi i32 [ %x2, %loop_with_insert_point ]
+exit:
+  ret i32 %x
+}

Added: llvm/trunk/test/Transforms/LCSSA/pr28608.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/pr28608.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/pr28608.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/pr28608.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt < %s -lcssa -disable-output
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR28608
+; Check that we don't crash on this test.
+
+define void @foo() {   ; inner cycle bb2 -> bb3 -> bb5 -> bb2 sits inside the outer cycle that returns to bb1 through bb6
+entry:
+  br label %bb1
+
+bb1:
+  br label %bb2
+
+bb2:
+  %x = phi i32 [ undef, %bb5 ], [ undef, %bb1 ]   ; both incoming values are undef
+  br i1 undef, label %bb3, label %bb6
+
+bb3:
+  br i1 undef, label %bb5, label %bb4
+
+bb4:
+  br label %bb6
+
+bb5:
+  br label %bb2
+
+bb6:
+  br label %bb1
+
+exit:   ; no branch in this function targets this block
+  %y = add i32 0, %x   ; uses %x, which is defined in %bb2
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LCSSA/remove-phis.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/remove-phis.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/remove-phis.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/remove-phis.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,56 @@
+; RUN: opt < %s -lcssa -verify -S -o /dev/null
+
+; This bugpoint reduced test case used to assert when removing unused PHI nodes.
+; Just verify that we do not assert/crash.
+
+define void @test() {   ; several cycles (gazank/gazink/gazonk, qqq/www, foo/bar) plus a trailing self-loop that holds the only use of %value
+entry:
+  br label %gazank
+
+gazank:
+  %value = phi i16 [ 0, %entry ], [ undef, %gazonk ]   ; only user is %dead in %infinite.loop
+  br i1 undef, label %gazink, label %qqq
+
+gazink:
+  br i1 undef, label %gazonk, label %infinite.loop.pred
+
+gazonk:
+  br i1 undef, label %exit1, label %gazank
+
+qqq:
+  br i1 undef, label %www, label %exit2
+
+www:
+  br i1 undef, label %qqq, label %foo.pred
+
+foo.pred:
+  br label %foo
+
+foo:
+  br i1 undef, label %bar, label %exit1.pred
+
+bar:
+  br i1 undef, label %foo, label %exit2.pred
+
+unreachable1:   ; no branch in this function targets this block
+  br i1 undef, label %foo, label %exit2.pred
+
+exit1.pred:
+  br label %exit1
+
+exit1:
+  ret void
+
+exit2.pred:
+  br label %exit2
+
+exit2:
+  ret void
+
+infinite.loop.pred:
+  br label %infinite.loop
+
+infinite.loop:   ; branches only to itself
+  %dead = phi i16 [ %value, %infinite.loop.pred ], [ 0, %infinite.loop ]   ; %dead has no users
+  br label %infinite.loop
+}

Added: llvm/trunk/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,137 @@
+; RUN: opt -S -lcssa < %s | FileCheck %s
+
+; Reproducer for PR39019.
+;
+; Verify that the llvm.dbg.values are updated to use the PHI nodes inserted by
+; LCSSA.
+
+; For the test case @single_exit, we can rewrite all llvm.dbg.value calls
+; to use the inserted PHI.
+
+; CHECK-LABEL: @single_exit(
+
+; CHECK-LABEL: inner.body:
+; CHECK: %add = add nsw i32 0, 2
+; CHECK: call void @llvm.dbg.value(metadata i32 %add, metadata [[VAR:![0-9]+]], metadata !DIExpression())
+
+
+; CHECK-LABEL: outer.exit:
+; CHECK-NEXT: [[PN:%[^ ]*]] = phi i32 [ %add.lcssa, %outer.latch ]
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[PN]], metadata [[VAR]], metadata !DIExpression())
+; CHECK-NEXT: call void @bar(i32 [[PN]])
+
+; CHECK-LABEL: exit:
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[PN]], metadata [[VAR]], metadata !DIExpression())
+
+define void @single_exit()  !dbg !6 {   ; %add is defined in the inner loop and used after the outer loop, by @bar and by two dbg.value calls
+entry:
+  br label %outer.header, !dbg !12
+
+outer.header:                                     ; preds = %outer.latch, %entry
+  br label %inner.body, !dbg !12
+
+inner.body:                                       ; preds = %inner.body, %outer.header
+  %add = add nsw i32 0, 2, !dbg !12
+  call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12
+  br i1 false, label %inner.body, label %inner.exit, !dbg !12
+
+inner.exit:                                       ; preds = %inner.body
+  br label %outer.latch
+
+outer.latch:                                      ; preds = %inner.exit
+  br i1 false, label %outer.header, label %outer.exit, !dbg !12
+
+outer.exit:                                       ; preds = %outer.latch
+  call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12
+  tail call void @bar(i32 %add), !dbg !12   ; the non-debug use of %add outside both loops
+  br label %exit
+
+exit:                                             ; preds = %outer.exit
+  call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12   ; debug-only use; %outer.exit is this block's sole predecessor
+  ret void, !dbg !12
+}
+
+; For the test case @multi_exit, we cannot update the llvm.dbg.value call in exit,
+; because LCSSA did not insert a PHI node in %exit, as there is no non-debug
+; use.
+
+; CHECK-LABEL: @multi_exit()
+
+; CHECK-LABEL: for.header:
+; CHECK-NEXT: %add = add nsw i32 0, 2
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %add, metadata [[VAR2:![0-9]+]], metadata !DIExpression())
+
+; CHECK-LABEL: for.exit1:
+; CHECK-NEXT: [[PN1:%[^ ]*]] = phi i32 [ %add, %for.header ]
+; CHECK-NEXT: br label %for.exit1.succ
+
+; CHECK-LABEL: for.exit1.succ:
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[PN1]], metadata [[VAR2]], metadata !DIExpression())
+; CHECK-NEXT: call void @bar(i32 [[PN1]])
+
+; CHECK-LABEL: for.exit2:
+; CHECK-NEXT: [[PN2:%[^ ]*]] = phi i32 [ %add, %for.latch ]
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[PN2]], metadata [[VAR2]], metadata !DIExpression())
+; CHECK-NEXT: call void @bar(i32 [[PN2]])
+
+; CHECK-LABEL: exit:
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %add, metadata [[VAR2]], metadata !DIExpression())
+
+define void @multi_exit()  !dbg !13 {   ; loop for.header <-> for.latch with two exits: %for.exit1 (from the header) and %for.exit2 (from the latch)
+entry:
+  br label %for.header, !dbg !14
+
+for.header:                                       ; preds = %for.latch, %entry
+  %add = add nsw i32 0, 2, !dbg !14
+  call void @llvm.dbg.value(metadata i32 %add, metadata !16, metadata !DIExpression()), !dbg !14
+  br i1 false, label %for.latch, label %for.exit1, !dbg !14
+
+for.latch:                                        ; preds = %for.header
+  br i1 false, label %for.header, label %for.exit2, !dbg !14
+
+for.exit1:                                        ; preds = %for.header
+  br label %for.exit1.succ
+
+for.exit1.succ:                                   ; preds = %for.exit1
+  call void @llvm.dbg.value(metadata i32 %add, metadata !16, metadata !DIExpression()), !dbg !14
+  tail call void @bar(i32 %add), !dbg !14   ; non-debug use one block past the exit block
+  br label %exit
+
+for.exit2:                                        ; preds = %for.latch
+  call void @llvm.dbg.value(metadata i32 %add, metadata !16, metadata !DIExpression()), !dbg !14
+  tail call void @bar(i32 %add), !dbg !14   ; non-debug use directly in the exit block
+  br label %exit
+
+exit:                                             ; preds = %for.exit2, %for.exit1.succ
+  call void @llvm.dbg.value(metadata i32 %add, metadata !16, metadata !DIExpression()), !dbg !14   ; the only use of %add in this join block is this debug intrinsic
+  ret void, !dbg !14
+}
+
+; CHECK: [[VAR]] = !DILocalVariable(name: "sum",
+; CHECK: [[VAR2]] = !DILocalVariable(name: "sum2",
+
+declare void @bar(i32)   ; called with %add in both test functions above
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2, nameTableKind: None)
+!1 = !DIFile(filename: "foo.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 8.0.0"}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)   ; attached to @single_exit, although its name field is "foo"
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "sum", scope: !10, file: !1, line: 11, type: !11)   ; variable used by the dbg.value calls in @single_exit
+!10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DILocation(line: 0, scope: !10)
+!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)   ; NOTE(review): retainedNodes reuses !8 = !{!9} ("sum"), not !16 ("sum2") - confirm this is intended
+!14 = !DILocation(line: 0, scope: !15)
+!15 = !DILexicalBlockFile(scope: !13, file: !1, discriminator: 0)
+!16 = !DILocalVariable(name: "sum2", scope: !15, file: !1, line: 11, type: !11)   ; variable used by the dbg.value calls in @multi_exit

Added: llvm/trunk/test/Transforms/LCSSA/unreachable-use.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/unreachable-use.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/unreachable-use.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/unreachable-use.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt < %s -lcssa -S -verify-loop-info | FileCheck %s
+; PR6546
+
+; LCSSA doesn't need to transform uses in blocks not reachable
+; from the entry block.
+
+; CHECK: %tmp33 = load i1*, i1** %tmp
+
+define fastcc void @dfs() nounwind {   ; cycle bb44 -> bb7 -> bb15 -> bb44; bb44 also branches out to %bb45
+bb:
+  br label %bb44
+
+bb44:
+  br i1 undef, label %bb7, label %bb45
+
+bb7:
+  %tmp = bitcast i1** undef to i1**   ; defined inside the cycle
+  br label %bb15
+
+bb15:
+  br label %bb44
+
+bb32:   ; no branch in this function targets this block
+  %tmp33 = load i1*, i1** %tmp, align 8   ; the only use of %tmp
+  br label %bb45
+
+bb45:
+  unreachable
+}

Added: llvm/trunk/test/Transforms/LCSSA/unused-phis.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LCSSA/unused-phis.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LCSSA/unused-phis.ll (added)
+++ llvm/trunk/test/Transforms/LCSSA/unused-phis.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -lcssa -S | FileCheck %s
+; RUN: opt < %s -passes=lcssa -S | FileCheck %s
+; CHECK: exit1:
+; CHECK: .lcssa =
+; CHECK: exit2:
+; CHECK: .lcssa1 =
+; CHECK: exit3:
+; CHECK-NOT: .lcssa
+
+; Test to ensure that when there are multiple exit blocks, PHI nodes are
+; only inserted by LCSSA when there is a use dominated by a given exit
+; block.
+
+declare void @printf(i32 %i)
+
+define i32 @unused_phis() nounwind {   ; loop -> then1 -> then2 -> loop, with one exit per block; only exit1 and exit2 use %i
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [1, %then2]
+  br i1 undef, label %exit1, label %then1
+
+then1:
+  br i1 undef, label %exit2, label %then2
+
+then2:
+  br i1 undef, label %exit3, label %loop
+
+exit1:
+  call void @printf(i32 %i)   ; first of two uses of %i in this block
+  ret i32 %i
+
+exit2:
+  ret i32 %i   ; single use of %i in this block
+
+exit3:
+  ret i32 0   ; returns a constant; %i is not used here
+}

Added: llvm/trunk/test/Transforms/LICM/2003-02-26-LoopExitNotDominated.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2003-02-26-LoopExitNotDominated.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2003-02-26-LoopExitNotDominated.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2003-02-26-LoopExitNotDominated.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,18 @@
+; RUN: opt < %s -basicaa -licm -disable-output
+
+;%MoveArray = external global [64 x ulong]
+
+define void @InitMoveArray() {   ; bb3 can branch straight to bb13 or go through bb4 into the bb8 self-loop
+bb3:
+	%X = alloca [2 x i64]		; <[2 x i64]*> [#uses=1]
+	br i1 false, label %bb13, label %bb4
+bb4:		; preds = %bb3
+	%reg3011 = getelementptr [2 x i64], [2 x i64]* %X, i64 0, i64 0		; <i64*> [#uses=1]
+	br label %bb8
+bb8:		; preds = %bb8, %bb4
+	store i64 0, i64* %reg3011		; address is computed before the loop, in bb4
+	br i1 false, label %bb8, label %bb13
+bb13:		; preds = %bb8, %bb3
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/2003-02-27-NestedLoopExitBlocks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2003-02-27-NestedLoopExitBlocks.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2003-02-27-NestedLoopExitBlocks.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2003-02-27-NestedLoopExitBlocks.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; Exit blocks need to be updated for all nested loops...
+
+; RUN: opt < %s -loop-simplify
+
+define i32 @yyparse() {		; cycle bb28 <-> bb32 nested in cycle bb19 <-> bb28; bb0, bb19 and bb32 each branch to UnifiedExitNode
+bb0:
+	br i1 false, label %UnifiedExitNode, label %bb19
+bb19:		; preds = %bb28, %bb0
+	br i1 false, label %bb28, label %UnifiedExitNode
+bb28:		; preds = %bb32, %bb19
+	br i1 false, label %bb32, label %bb19
+bb32:		; preds = %bb28
+	br i1 false, label %UnifiedExitNode, label %bb28
+UnifiedExitNode:		; preds = %bb32, %bb19, %bb0
+	ret i32 0
+}
+

Added: llvm/trunk/test/Transforms/LICM/2003-02-27-PreheaderExitNodeUpdate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2003-02-27-PreheaderExitNodeUpdate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2003-02-27-PreheaderExitNodeUpdate.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2003-02-27-PreheaderExitNodeUpdate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; This testcase fails because preheader insertion is not updating exit node 
+; information for loops.
+
+; RUN: opt < %s -licm
+
+define i32 @main(i32 %argc, i8** %argv) {		; two self-loops in sequence (bb5, then bb7); bb0 can also branch straight to bb7
+bb0:
+	br i1 false, label %bb7, label %bb5
+bb5:		; preds = %bb5, %bb0
+	br i1 false, label %bb5, label %bb7
+bb7:		; preds = %bb7, %bb5, %bb0
+	br i1 false, label %bb7, label %bb10
+bb10:		; preds = %bb7
+	ret i32 0
+}
+

Added: llvm/trunk/test/Transforms/LICM/2003-02-27-PreheaderProblem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2003-02-27-PreheaderProblem.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2003-02-27-PreheaderProblem.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2003-02-27-PreheaderProblem.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; Here we have a case where there are two loops and LICM is hoisting an 
+; instruction from one loop into the other loop!  This is obviously bad and 
+; happens because preheader insertion doesn't insert a preheader for this
+; case... bad.
+
+; RUN: opt < %s -licm -loop-deletion -simplifycfg -S | \
+; RUN:   not grep "br "
+
+define i32 @main(i32 %argc) {		; two consecutive self-loops: bb5 increments %I, bb8 computes %X
+; <label>:0
+	br label %bb5
+bb5:		; preds = %bb5, %0
+	%I = phi i32 [ 0, %0 ], [ %I2, %bb5 ]		; <i32> [#uses=1]
+	%I2 = add i32 %I, 1		; <i32> [#uses=2]
+	%c = icmp eq i32 %I2, 10		; <i1> [#uses=1]
+	br i1 %c, label %bb5, label %bb8
+bb8:		; preds = %bb8, %bb5
+	%cann-indvar = phi i32 [ 0, %bb8 ], [ 0, %bb5 ]		; <i32> [#uses=0]
+	%X = add i32 %argc, %argc		; <i32> [#uses=1]; both operands are the function argument
+	br i1 false, label %bb8, label %bb10
+bb10:		; preds = %bb8
+	ret i32 %X
+}
+

Added: llvm/trunk/test/Transforms/LICM/2003-02-27-StoreSinkPHIs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2003-02-27-StoreSinkPHIs.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2003-02-27-StoreSinkPHIs.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2003-02-27-StoreSinkPHIs.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; LICM is adding stores before phi nodes.  bad.
+
+; RUN: opt < %s -licm
+
+define i1 @test(i1 %c) {		; self-loop containing a store; the loop's exit block %Out begins with a phi
+; <label>:0
+	br i1 %c, label %Loop, label %Out
+Loop:		; preds = %Loop, %0
+	store i32 0, i32* null
+	br i1 %c, label %Loop, label %Out
+Out:		; preds = %Loop, %0
+	%X = phi i1 [ %c, %0 ], [ true, %Loop ]		; <i1> [#uses=1]
+	ret i1 %X
+}
+

Added: llvm/trunk/test/Transforms/LICM/2003-02-28-PromoteDifferentType.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2003-02-28-PromoteDifferentType.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2003-02-28-PromoteDifferentType.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2003-02-28-PromoteDifferentType.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; Test that hoisting is disabled for pointers of different types...
+;
+; RUN: opt < %s -licm
+
+define void @test(i32* %P) {		; the loop stores through %P as i32 and through a bitcast of %P as i8
+	br label %Loop
+Loop:		; preds = %Loop, %0
+	store i32 5, i32* %P
+	%P2 = bitcast i32* %P to i8*		; <i8*> [#uses=1]
+	store i8 4, i8* %P2		; same address as the store above, different pointee type
+	br i1 true, label %Loop, label %Out
+Out:		; preds = %Loop
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/2003-05-02-LoadHoist.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2003-05-02-LoadHoist.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2003-05-02-LoadHoist.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2003-05-02-LoadHoist.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; This testcase tests for a problem where LICM hoists loads out of a loop 
+; despite the fact that calls to unknown functions may modify what is being 
+; loaded from.  Basically if the load gets hoisted, the subtract gets turned
+; into a constant zero.
+;
+; RUN: opt < %s -licm -gvn -instcombine -S | grep load
+
+@X = global i32 7		; <i32*> [#uses=2]
+
+declare void @foo()
+
+define i32 @test(i1 %c) {		; loads @X before the loop (%A) and inside it after a call (%B), then returns %A - %B
+	%A = load i32, i32* @X		; <i32> [#uses=1]
+	br label %Loop
+Loop:		; preds = %Loop, %0
+	call void @foo( )		; declared above without a body
+        ;; Should not hoist this load!
+	%B = load i32, i32* @X		; <i32> [#uses=1]
+	br i1 %c, label %Loop, label %Out
+Out:		; preds = %Loop
+	%C = sub i32 %A, %B		; <i32> [#uses=1]
+	ret i32 %C
+}

Added: llvm/trunk/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -licm | lli -force-interpreter
+
+define i32 @main() {		; with the constant conditions as written, control goes Loop -> LoopCont -> Out, so %V = %X = 1 and the result is 0
+entry:
+	br label %Loop
+Loop:		; preds = %LoopCont, %entry
+	br i1 true, label %LoopCont, label %Out
+LoopCont:		; preds = %Loop
+	%X = add i32 1, 0		; <i32> [#uses=1]
+	br i1 true, label %Out, label %Loop
+Out:		; preds = %LoopCont, %Loop
+	%V = phi i32 [ 2, %Loop ], [ %X, %LoopCont ]		; <i32> [#uses=1]
+	%V2 = sub i32 %V, 1		; <i32> [#uses=1]
+	ret i32 %V2
+}
+

Added: llvm/trunk/test/Transforms/LICM/2004-09-14-AliasAnalysisInvalidate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2004-09-14-AliasAnalysisInvalidate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2004-09-14-AliasAnalysisInvalidate.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2004-09-14-AliasAnalysisInvalidate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt < %s -globals-aa -licm -disable-output
+
+@PL_regcomp_parse = internal global i8* null		; <i8**> [#uses=2]
+
+define void @test() {		; self-loop %Inner nested in the Outer -> Inner -> Next -> Outer cycle; %Inner both loads and stores @PL_regcomp_parse
+	br label %Outer
+Outer:		; preds = %Next, %0
+	br label %Inner
+Inner:		; preds = %Inner, %Outer
+	%tmp.114.i.i.i = load i8*, i8** @PL_regcomp_parse		; <i8*> [#uses=1]
+	%tmp.115.i.i.i = load i8, i8* %tmp.114.i.i.i		; <i8> [#uses=0]
+	store i8* null, i8** @PL_regcomp_parse
+	br i1 false, label %Inner, label %Next
+Next:		; preds = %Inner
+	br i1 false, label %Outer, label %Exit
+Exit:		; preds = %Next
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/2004-11-17-UndefIndexCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2004-11-17-UndefIndexCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2004-11-17-UndefIndexCrash.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2004-11-17-UndefIndexCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -licm -disable-output
+	%struct.roadlet = type { i8*, %struct.vehicle*, [8 x %struct.roadlet*], [8 x %struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)*] }
+	%struct.vehicle = type { %struct.roadlet*, i8*, i32, i32, %union.._631., i32 }
+	%union.._631. = type { i32 }
+
+declare %struct.roadlet* @_Z11return_nullP7roadletP7vehicle9direction(%struct.roadlet*, %struct.vehicle*, i32)
+
+declare %struct.roadlet* @_Z14lane_switch_okP7roadletP7vehicle9direction(%struct.roadlet*, %struct.vehicle*, i32)
+
+define void @main() {		; a single self-loop with no exit edge; one getelementptr in it uses an undef index
+__main.entry:
+	br label %invoke_cont.3
+invoke_cont.3:		; preds = %invoke_cont.3, %__main.entry
+	%tmp.34.i.i502.7 = getelementptr %struct.roadlet, %struct.roadlet* null, i32 0, i32 3, i32 7		; <%struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)**> [#uses=1]
+	store %struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)* @_Z11return_nullP7roadletP7vehicle9direction, %struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)** %tmp.34.i.i502.7
+	store %struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)* @_Z14lane_switch_okP7roadletP7vehicle9direction, %struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)** null
+	%tmp.4.i.i339 = getelementptr %struct.roadlet, %struct.roadlet* null, i32 0, i32 3, i32 undef		; <%struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)**> [#uses=1]; last index is undef
+	store %struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)* @_Z11return_nullP7roadletP7vehicle9direction, %struct.roadlet* (%struct.roadlet*, %struct.vehicle*, i32)** %tmp.4.i.i339
+	br label %invoke_cont.3
+}

Added: llvm/trunk/test/Transforms/LICM/2006-09-12-DeadUserOfSunkInstr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2006-09-12-DeadUserOfSunkInstr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2006-09-12-DeadUserOfSunkInstr.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2006-09-12-DeadUserOfSunkInstr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,148 @@
+; RUN: opt < %s -licm -disable-output
+; PR908
+; END.
+
+	%struct.alloc_chain = type { i8*, %struct.alloc_chain* }
+	%struct.oggpack_buffer = type { i32, i32, i8*, i8*, i32 }
+	%struct.vorbis_block = type { float**, %struct.oggpack_buffer, i32, i32, i32, i32, i32, i32, i64, i64, %struct.vorbis_dsp_state*, i8*, i32, i32, i32, %struct.alloc_chain*, i32, i32, i32, i32, i8* }
+	%struct.vorbis_dsp_state = type { i32, %struct.vorbis_info*, float**, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
+	%struct.vorbis_info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
+
+define fastcc void @_01forward() {
+entry:
+	br i1 false, label %bb222.preheader, label %bb241
+cond_true67:		; preds = %cond_true87
+	br label %cond_next80
+cond_next80:		; preds = %cond_true87, %cond_true67
+	br label %bb83
+bb83.preheader:		; preds = %cond_true226
+	br i1 false, label %bb83.us.preheader, label %bb83.preheader1
+bb83.us.preheader:		; preds = %bb83.preheader
+	br label %bb83.us
+bb83.us:		; preds = %cond_next80.us, %bb83.us.preheader
+	br i1 false, label %cond_true87.us, label %cond_next92.loopexit2
+cond_next80.us:		; preds = %bb59.loopexit.us, %cond_true67.us
+	br label %bb83.us
+cond_true67.us:		; preds = %bb59.loopexit.us
+	br label %cond_next80.us
+cond_next.us:		; preds = %cond_true56.us, %cond_true38.us
+	br i1 false, label %cond_true56.us, label %bb59.loopexit.us
+cond_true38.us:		; preds = %cond_true56.us
+	br label %cond_next.us
+cond_true56.us:		; preds = %cond_true87.us, %cond_next.us
+	br i1 false, label %cond_true38.us, label %cond_next.us
+cond_true87.us:		; preds = %bb83.us
+	br label %cond_true56.us
+bb59.loopexit.us:		; preds = %cond_next.us
+	br i1 false, label %cond_true67.us, label %cond_next80.us
+bb83.preheader1:		; preds = %bb83.preheader
+	br label %bb83
+bb83:		; preds = %bb83.preheader1, %cond_next80
+	br i1 false, label %cond_next92.loopexit, label %cond_true87
+cond_true87:		; preds = %bb83
+	br i1 false, label %cond_true67, label %cond_next80
+cond_next92.loopexit:		; preds = %bb83
+	br label %cond_next92
+cond_next92.loopexit2:		; preds = %bb83.us
+	br label %cond_next92
+cond_next92:		; preds = %cond_true226, %cond_next92.loopexit2, %cond_next92.loopexit
+	br i1 false, label %cond_true218.loopexit, label %bb222
+cond_true139:		; preds = %cond_true202
+	br i1 false, label %cond_next195, label %cond_true155
+cond_true155:		; preds = %cond_true139
+	br i1 false, label %cond_true249.i.preheader, label %_encodepart.exit
+cond_true.i:		; preds = %cond_true115.i
+	br i1 false, label %bb60.i.preheader, label %cond_next97.i
+bb60.i.preheader:		; preds = %cond_true.i
+	br label %bb60.i
+bb60.i:		; preds = %cond_true63.i, %bb60.i.preheader
+	br i1 false, label %cond_true63.i, label %cond_next97.i.loopexit
+cond_true63.i:		; preds = %bb60.i
+	br i1 false, label %bb60.i, label %cond_next97.i.loopexit
+bb86.i.preheader:		; preds = %cond_true115.i
+	br label %bb86.i
+bb86.i:		; preds = %cond_true93.i, %bb86.i.preheader
+	br i1 false, label %cond_true93.i, label %cond_next97.i.loopexit3
+cond_true93.i:		; preds = %bb86.i
+	br i1 false, label %cond_next97.i.loopexit3, label %bb86.i
+cond_next97.i.loopexit:		; preds = %cond_true63.i, %bb60.i
+	br label %cond_next97.i
+cond_next97.i.loopexit3:		; preds = %cond_true93.i, %bb86.i
+	br label %cond_next97.i
+cond_next97.i:		; preds = %cond_next97.i.loopexit3, %cond_next97.i.loopexit, %cond_true.i
+	br i1 false, label %bb118.i.loopexit, label %cond_true115.i
+cond_true115.i.preheader:		; preds = %cond_true249.i
+	br label %cond_true115.i
+cond_true115.i:		; preds = %cond_true115.i.preheader, %cond_next97.i
+	br i1 false, label %cond_true.i, label %bb86.i.preheader
+bb118.i.loopexit:		; preds = %cond_next97.i
+	br label %bb118.i
+bb118.i:		; preds = %cond_true249.i, %bb118.i.loopexit
+	br i1 false, label %cond_next204.i, label %cond_true128.i
+cond_true128.i:		; preds = %bb118.i
+	br i1 false, label %cond_true199.i.preheader, label %cond_next204.i
+cond_true199.i.preheader:		; preds = %cond_true128.i
+	br label %cond_true199.i
+cond_true199.i.us:		; No predecessors!
+	br i1 false, label %cond_true167.i.us, label %cond_next187.i.us
+cond_next187.i.us:		; preds = %bb170.i.loopexit.us, %bb170.i.us.cond_next187.i.us_crit_edge, %cond_true199.i.us
+	unreachable
+bb170.i.us.cond_next187.i.us_crit_edge:		; preds = %bb170.i.loopexit.us
+	br label %cond_next187.i.us
+cond_true167.i.us:		; preds = %cond_true167.i.us, %cond_true199.i.us
+	br i1 false, label %cond_true167.i.us, label %bb170.i.loopexit.us
+bb170.i.loopexit.us:		; preds = %cond_true167.i.us
+	br i1 false, label %cond_next187.i.us, label %bb170.i.us.cond_next187.i.us_crit_edge
+cond_true199.i:		; preds = %cond_true199.i, %cond_true199.i.preheader
+	br i1 false, label %cond_next204.i.loopexit, label %cond_true199.i
+cond_next204.i.loopexit:		; preds = %cond_true199.i
+	br label %cond_next204.i
+cond_next204.i:		; preds = %cond_next204.i.loopexit, %cond_true128.i, %bb118.i
+	br label %bb233.i
+cond_true230.i:		; No predecessors!
+	%exitcond155 = icmp eq i32 0, %tmp16.i		; <i1> [#uses=0]
+	unreachable
+bb233.i:		; preds = %cond_next204.i
+	br i1 false, label %_encodepart.exit.loopexit, label %cond_true249.i
+cond_true249.i.preheader:		; preds = %cond_true155
+	br label %cond_true249.i
+cond_true249.i:		; preds = %cond_true249.i.preheader, %bb233.i
+	%tmp16.i = bitcast i32 0 to i32		; <i32> [#uses=1]
+	br i1 false, label %cond_true115.i.preheader, label %bb118.i
+_encodepart.exit.loopexit:		; preds = %bb233.i
+	br label %_encodepart.exit
+_encodepart.exit:		; preds = %_encodepart.exit.loopexit, %cond_true155
+	br label %cond_next195
+cond_next195:		; preds = %cond_true202, %_encodepart.exit, %cond_true139
+	br i1 false, label %bb205.loopexit, label %cond_true202
+cond_true202.preheader:		; preds = %cond_true218
+	br label %cond_true202
+cond_true202:		; preds = %cond_true202.preheader, %cond_next195
+	br i1 false, label %cond_next195, label %cond_true139
+bb205.loopexit:		; preds = %cond_next195
+	br label %bb205
+bb205:		; preds = %cond_true218, %bb205.loopexit
+	br i1 false, label %cond_true218, label %bb222.outer105.loopexit
+cond_true218.loopexit:		; preds = %cond_next92
+	br label %cond_true218
+cond_true218:		; preds = %cond_true218.loopexit, %bb205
+	br i1 false, label %cond_true202.preheader, label %bb205
+bb222.preheader:		; preds = %entry
+	br label %bb222.outer
+bb222.outer:		; preds = %bb229, %bb222.preheader
+	br label %bb222.outer105
+bb222.outer105.loopexit:		; preds = %bb205
+	br label %bb222.outer105
+bb222.outer105:		; preds = %bb222.outer105.loopexit, %bb222.outer
+	br label %bb222
+bb222:		; preds = %bb222.outer105, %cond_next92
+	br i1 false, label %cond_true226, label %bb229
+cond_true226:		; preds = %bb222
+	br i1 false, label %bb83.preheader, label %cond_next92
+bb229:		; preds = %bb222
+	br i1 false, label %bb222.outer, label %bb241.loopexit
+bb241.loopexit:		; preds = %bb229
+	br label %bb241
+bb241:		; preds = %bb241.loopexit, %entry
+	ret void
+}

Added: llvm/trunk/test/Transforms/LICM/2007-05-22-VolatileSink.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2007-05-22-VolatileSink.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2007-05-22-VolatileSink.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2007-05-22-VolatileSink.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,56 @@
+; RUN: opt < %s -licm -S | grep "store volatile"
+; PR1435
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
+target triple = "i686-apple-darwin8"
+
+define void @Transpose(i32* %DataIn, i32* %DataOut) {
+entry:
+	%buffer = alloca [64 x i32], align 16		; <[64 x i32]*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	br label %bb6
+
+bb:		; preds = %bb6
+	%tmp2 = load volatile i32, i32* %DataIn		; <i32> [#uses=1]
+	%tmp3 = getelementptr [64 x i32], [64 x i32]* %buffer, i32 0, i32 %i.0		; <i32*> [#uses=1]
+	store i32 %tmp2, i32* %tmp3
+	%tmp5 = add i32 %i.0, 1		; <i32> [#uses=1]
+	br label %bb6
+
+bb6:		; preds = %bb, %entry
+	%i.0 = phi i32 [ 0, %entry ], [ %tmp5, %bb ]		; <i32> [#uses=3]
+	%tmp8 = icmp sle i32 %i.0, 63		; <i1> [#uses=1]
+	%tmp89 = zext i1 %tmp8 to i8		; <i8> [#uses=1]
+	%toBool = icmp ne i8 %tmp89, 0		; <i1> [#uses=1]
+	br i1 %toBool, label %bb, label %bb30
+
+bb12:		; preds = %bb22
+	%tmp14 = mul i32 %j.1, 8		; <i32> [#uses=1]
+	%tmp16 = add i32 %tmp14, %i.1		; <i32> [#uses=1]
+	%tmp17 = getelementptr [64 x i32], [64 x i32]* %buffer, i32 0, i32 %tmp16		; <i32*> [#uses=1]
+	%tmp18 = load i32, i32* %tmp17		; <i32> [#uses=1]
+	store volatile i32 %tmp18, i32* %DataOut
+	%tmp21 = add i32 %j.1, 1		; <i32> [#uses=1]
+	br label %bb22
+
+bb22:		; preds = %bb30, %bb12
+	%j.1 = phi i32 [ %tmp21, %bb12 ], [ 0, %bb30 ]		; <i32> [#uses=4]
+	%tmp24 = icmp sle i32 %j.1, 7		; <i1> [#uses=1]
+	%tmp2425 = zext i1 %tmp24 to i8		; <i8> [#uses=1]
+	%toBool26 = icmp ne i8 %tmp2425, 0		; <i1> [#uses=1]
+	br i1 %toBool26, label %bb12, label %bb27
+
+bb27:		; preds = %bb22
+	%tmp29 = add i32 %i.1, 1		; <i32> [#uses=1]
+	br label %bb30
+
+bb30:		; preds = %bb27, %bb6
+	%j.0 = phi i32 [ %j.1, %bb27 ], [ undef, %bb6 ]		; <i32> [#uses=0]
+	%i.1 = phi i32 [ %tmp29, %bb27 ], [ 0, %bb6 ]		; <i32> [#uses=3]
+	%tmp32 = icmp sle i32 %i.1, 7		; <i1> [#uses=1]
+	%tmp3233 = zext i1 %tmp32 to i8		; <i8> [#uses=1]
+	%toBool34 = icmp ne i8 %tmp3233, 0		; <i1> [#uses=1]
+	br i1 %toBool34, label %bb22, label %return
+
+return:		; preds = %bb30
+	ret void
+}

Added: llvm/trunk/test/Transforms/LICM/2007-07-30-AliasSet.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2007-07-30-AliasSet.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2007-07-30-AliasSet.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2007-07-30-AliasSet.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -licm -loop-unswitch -disable-output
+	%struct.III_scalefac_t = type { [22 x i32], [13 x [3 x i32]] }
+	%struct.gr_info = type { i32, i32, i32, i32, i32, i32, i32, i32, [3 x i32], [3 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, [4 x i32] }
+
+define i32 @scale_bitcount_lsf(%struct.III_scalefac_t* %scalefac, %struct.gr_info* %cod_info) {
+entry:
+	br i1 false, label %bb28, label %bb133.preheader
+
+bb133.preheader:		; preds = %entry
+	ret i32 0
+
+bb28:		; preds = %entry
+	br i1 false, label %bb63.outer, label %bb79
+
+bb63.outer:		; preds = %bb73, %bb28
+	br i1 false, label %bb35, label %bb73
+
+bb35:		; preds = %cond_next60, %bb63.outer
+	%window.34 = phi i32 [ %tmp62, %cond_next60 ], [ 0, %bb63.outer ]		; <i32> [#uses=1]
+	%tmp44 = getelementptr [4 x i32], [4 x i32]* null, i32 0, i32 0		; <i32*> [#uses=1]
+	%tmp46 = load i32, i32* %tmp44, align 4		; <i32> [#uses=0]
+	br i1 false, label %cond_true50, label %cond_next60
+
+cond_true50:		; preds = %bb35
+	%tmp59 = getelementptr [4 x i32], [4 x i32]* null, i32 0, i32 0		; <i32*> [#uses=1]
+	store i32 0, i32* %tmp59, align 4
+	br label %cond_next60
+
+cond_next60:		; preds = %cond_true50, %bb35
+	%tmp62 = add i32 %window.34, 1		; <i32> [#uses=1]
+	br i1 false, label %bb35, label %bb73
+
+bb73:		; preds = %cond_next60, %bb63.outer
+	%tmp76 = icmp slt i32 0, 0		; <i1> [#uses=1]
+	br i1 %tmp76, label %bb63.outer, label %bb79
+
+bb79:		; preds = %bb73, %bb28
+	ret i32 0
+}

Added: llvm/trunk/test/Transforms/LICM/2007-09-17-PromoteValue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2007-09-17-PromoteValue.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2007-09-17-PromoteValue.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2007-09-17-PromoteValue.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,61 @@
+; ModuleID = 'PR1657.bc'
+; Do not promote getelementptr because it may expose a load from a null pointer
+; and a store to a null pointer, which are guarded by the
+; icmp eq %struct.decision* null, null condition.
+; RUN: opt < %s -licm -S | not grep promoted
+	%struct.decision = type { i8, %struct.decision* }
+
+define i32 @main() {
+entry:
+	br label %blah.i
+
+blah.i:		; preds = %cond_true.i, %entry
+	%tmp3.i = icmp eq %struct.decision* null, null		; <i1> [#uses=1]
+	br i1 %tmp3.i, label %clear_modes.exit, label %cond_true.i
+
+cond_true.i:		; preds = %blah.i
+	%tmp1.i = getelementptr %struct.decision, %struct.decision* null, i32 0, i32 0		; <i8*> [#uses=1]
+	store i8 0, i8* %tmp1.i
+	br label %blah.i
+
+clear_modes.exit:		; preds = %blah.i
+	call void @exit( i32 0 )
+	unreachable
+}
+
+define i32 @f(i8* %ptr) {
+entry:
+        br label %loop.head
+
+loop.head:              ; preds = %cond.true, %entry
+        %x = phi i8* [ %ptr, %entry ], [ %ptr.i, %cond.true ]           ; <i8*> [#uses=1]
+        %tmp3.i = icmp ne i8* %ptr, %x          ; <i1> [#uses=1]
+        br i1 %tmp3.i, label %cond.true, label %exit
+
+cond.true:              ; preds = %loop.head
+        %ptr.i = getelementptr i8, i8* %ptr, i32 0          ; <i8*> [#uses=2]
+        store i8 0, i8* %ptr.i
+        br label %loop.head
+
+exit:           ; preds = %loop.head
+        ret i32 0
+}
+
+define i32 @f2(i8* %p, i8* %q) {      ; loops until %q compares equal to null, storing 0 through %p each trip
+entry:
+        br label %loop.head
+
+loop.head:              ; preds = %cond.true, %entry
+        %tmp3.i = icmp eq i8* null, %q            ; <i1> [#uses=1]
+        br i1 %tmp3.i, label %exit, label %cond.true
+
+cond.true:              ; preds = %loop.head
+        %ptr.i = getelementptr i8, i8* %p, i32 0          ; <i8*> [#uses=1]
+        store i8 0, i8* %ptr.i          ; only reached when %q is non-null
+        br label %loop.head
+
+exit:           ; preds = %loop.head
+        ret i32 0
+}
+
+declare void @exit(i32)

Added: llvm/trunk/test/Transforms/LICM/2007-09-24-PromoteNullValue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2007-09-24-PromoteNullValue.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2007-09-24-PromoteNullValue.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2007-09-24-PromoteNullValue.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; Do not promote null value because it may be unsafe to do so.
+; RUN: opt < %s -licm -S | not grep promoted
+
+define i32 @f(i32 %foo, i32 %bar, i32 %com) {
+entry:
+	%tmp2 = icmp eq i32 %foo, 0		; <i1> [#uses=1]
+	br i1 %tmp2, label %cond_next, label %cond_true
+
+cond_true:		; preds = %entry
+	br label %return
+
+cond_next:		; preds = %entry
+	br label %bb
+
+bb:		; preds = %bb15, %cond_next
+	switch i32 %bar, label %bb15 [
+		 i32 1, label %bb6
+	]
+
+bb6:		; preds = %bb
+	%tmp8 = icmp eq i32 %com, 0		; <i1> [#uses=1]
+	br i1 %tmp8, label %cond_next14, label %cond_true11
+
+cond_true11:		; preds = %bb6
+	br label %return
+
+cond_next14:		; preds = %bb6
+	store i8 0, i8* null
+	br label %bb15
+
+bb15:		; preds = %cond_next14, %bb
+	br label %bb
+
+return:		; preds = %cond_true11, %cond_true
+	%storemerge = phi i32 [ 0, %cond_true ], [ undef, %cond_true11 ]		; <i32> [#uses=1]
+	ret i32 %storemerge
+}
+
+define i32 @kdMain() {
+entry:
+	%tmp1 = call i32 @f( i32 0, i32 1, i32 1 )		; <i32> [#uses=0]
+	call void @exit( i32 0 )
+	unreachable
+}
+
+declare void @exit(i32)

Added: llvm/trunk/test/Transforms/LICM/2007-10-01-PromoteSafeValue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2007-10-01-PromoteSafeValue.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2007-10-01-PromoteSafeValue.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2007-10-01-PromoteSafeValue.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -licm -S | FileCheck %s
+; Promote value if at least one use is safe
+
+
+define i32 @f2(i32* %p, i8* %q) {
+entry:
+        br label %loop.head
+
+loop.head:              ; preds = %cond.true, %entry
+        store i32 20, i32* %p
+        %tmp3.i = icmp eq i8* null, %q            ; <i1> [#uses=1]
+        br i1 %tmp3.i, label %exit, label %cond.true
+        
+cond.true:              ; preds = %loop.head
+        store i32 40, i32* %p
+        br label %loop.head
+
+; CHECK: exit:
+; CHECK: store i32 20, i32* %p
+exit:           ; preds = %loop.head
+        ret i32 0
+}
+

Added: llvm/trunk/test/Transforms/LICM/2008-05-20-AliasSetVAArg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2008-05-20-AliasSetVAArg.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2008-05-20-AliasSetVAArg.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2008-05-20-AliasSetVAArg.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s -licm -disable-output
+; PR2346
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-pc-linux-gnu"
+	%struct._zval_struct = type { %union._double, i32, i8, i8, i8, i8 }
+	%union._double = type { double }
+
+define i8* @zend_fetch_resource(%struct._zval_struct** %passed_id, i32 %default_id, i8* %resource_type_name, i32* %found_resource_type, i32 %num_resource_types, ...) {
+entry:
+	br label %whilebody.i.i
+
+whilebody.i.i:		; preds = %whilebody.i.i, %entry
+	br i1 false, label %ifthen.i.i, label %whilebody.i.i
+
+ifthen.i.i:		; preds = %whilebody.i.i
+	br label %forcond
+
+forcond:		; preds = %forbody, %ifthen.i.i
+	br i1 false, label %forbody, label %afterfor
+
+forbody:		; preds = %forcond
+	va_arg i8** null, i32		; <i32>:0 [#uses=0]
+	br i1 false, label %ifthen59, label %forcond
+
+ifthen59:		; preds = %forbody
+	unreachable
+
+afterfor:		; preds = %forcond
+	ret i8* null
+}

Added: llvm/trunk/test/Transforms/LICM/2008-07-22-LoadGlobalConstant.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2008-07-22-LoadGlobalConstant.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2008-07-22-LoadGlobalConstant.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2008-07-22-LoadGlobalConstant.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+
+@a = external constant float*  ; read-only global declared elsewhere; loaded in %forbody
+
+define void @test(i32 %count) {
+entry:
+        br label %forcond
+
+; CHECK:  %tmp3 = load float*, float** @a
+; CHECK:  br label %forcond
+
+forcond:
+        %i.0 = phi i32 [ 0, %entry ], [ %inc, %forbody ]
+        %cmp = icmp ult i32 %i.0, %count
+        br i1 %cmp, label %forbody, label %afterfor
+
+; CHECK:  %i.0 = phi i32 [ 0, %entry ], [ %inc, %forbody ]
+; CHECK:  %cmp = icmp ult i32 %i.0, %count
+; CHECK:  br i1 %cmp, label %forbody, label %afterfor
+
+forbody:
+        %tmp3 = load float*, float** @a
+        %arrayidx = getelementptr float, float* %tmp3, i32 %i.0
+        %tmp7 = uitofp i32 %i.0 to float
+        store float %tmp7, float* %arrayidx
+        %inc = add i32 %i.0, 1
+        br label %forcond
+
+; CHECK:  %arrayidx = getelementptr float, float* %tmp3, i32 %i.0
+; CHECK:  %tmp7 = uitofp i32 %i.0 to float
+; CHECK:  store float %tmp7, float* %arrayidx
+; CHECK:  %inc = add i32 %i.0, 1
+; CHECK:  br label %forcond
+
+afterfor:
+        ret void
+}
+
+; CHECK:  ret void

Added: llvm/trunk/test/Transforms/LICM/2009-12-10-LICM-Indbr-Crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2009-12-10-LICM-Indbr-Crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2009-12-10-LICM-Indbr-Crash.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2009-12-10-LICM-Indbr-Crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; Test for rdar://7452967
+; RUN: opt < %s -licm -disable-output
+define void @foo (i8* %v)
+{
+  entry:
+    br i1 undef, label %preheader, label %return
+
+  preheader:
+    br i1 undef, label %loop, label %return
+
+  loop:
+    indirectbr i8* undef, [label %preheader, label %stuff]
+
+  stuff:
+    %0 = load i8, i8* undef, align 1
+    br label %loop
+
+  return:
+    ret void
+
+}

Added: llvm/trunk/test/Transforms/LICM/2011-04-06-HoistMissedASTUpdate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2011-04-06-HoistMissedASTUpdate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2011-04-06-HoistMissedASTUpdate.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2011-04-06-HoistMissedASTUpdate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+; PR9630
+
+@g_39 = external global i16, align 2  ; target of the volatile load in %for.body4
+
+declare i32* @func_84(i32** nocapture) nounwind readonly
+
+declare i32** @func_108(i32*** nocapture) nounwind readonly
+
+define void @func() nounwind {
+entry:
+  br label %for.body4.lr.ph
+
+for.body4.lr.ph:
+  br label %for.body4
+
+; CHECK: for.body4:
+; CHECK: load volatile i16, i16* @g_39
+
+for.body4:
+  %l_612.11 = phi i32* [ undef, %for.body4.lr.ph ], [ %call19, %for.body4 ]
+  %tmp7 = load volatile i16, i16* @g_39, align 2
+  %call = call i32** @func_108(i32*** undef)
+  %call19 = call i32* @func_84(i32** %call)
+  br i1 false, label %for.body4, label %for.cond.loopexit
+
+for.cond.loopexit:
+  br i1 false, label %for.body4.lr.ph, label %for.end26
+
+for.end26:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -tbaa -licm -S | FileCheck %s
+; PR9634
+
+@g_58 = common global i32 0, align 4  ; its address is what @f stores into @g_116
+@g_116 = common global i32* null, align 8  ; pointer slot that @f writes and then reloads on every iteration
+
+define void @f() nounwind {
+; One-block loop: stores @g_58's address into @g_116, reloads it, then ORs 10 into the pointee.
+; CHECK: entry:
+; CHECK: alloca [9 x i16]
+; CHECK: load i32, i32* @g_58
+; CHECK: br label %for.body
+; The FileCheck lines above look for a load of @g_58 between the entry label and the branch to %for.body.
+entry:
+  %l_87.i = alloca [9 x i16], align 16            ; never referenced again in this function
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %inc12 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  store i32* @g_58, i32** @g_116, align 8, !tbaa !0
+  %tmp2 = load i32*, i32** @g_116, align 8, !tbaa !0   ; reloads the pointer stored on the previous line
+  %tmp3 = load i32, i32* %tmp2, !tbaa !4
+  %or = or i32 %tmp3, 10
+  store i32 %or, i32* %tmp2, !tbaa !4
+  %inc = add nsw i32 %inc12, 1
+  %cmp = icmp slt i32 %inc, 4
+  br i1 %cmp, label %for.body, label %for.end     ; back edge taken while %inc < 4
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!0 = !{!5, !5, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"short", !1}
+!4 = !{!6, !6, i64 0}
+!5 = !{!"any pointer", !1}
+!6 = !{!"int", !1}

Added: llvm/trunk/test/Transforms/LICM/2011-04-09-RAUW-AST.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2011-04-09-RAUW-AST.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2011-04-09-RAUW-AST.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2011-04-09-RAUW-AST.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-rotate -licm -S | FileCheck %s
+; PR9604
+
+@g_3 = global i32 0, align 4  ; loaded in %entry and %for.end13, stored to in %for.body7
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00"  ; format string passed to printf in @main
+
+define i32 @main() nounwind {
+entry:
+  %tmp = load i32, i32* @g_3, align 4
+  %tobool = icmp eq i32 %tmp, 0
+  br i1 %tobool, label %for.cond, label %if.then
+
+if.then:                                          ; preds = %entry
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %if.then, %entry
+  %g.0 = phi i32* [ %g.0, %for.inc10 ], [ @g_3, %entry ], [ null, %if.then ]
+  %x.0 = phi i32 [ %inc12, %for.inc10 ], [ 0, %entry ], [ 0, %if.then ]
+  %cmp = icmp slt i32 %x.0, 5
+  br i1 %cmp, label %for.cond4, label %for.end13
+
+for.cond4:                                        ; preds = %for.body7, %for.cond
+  %y.0 = phi i32 [ %inc, %for.body7 ], [ 0, %for.cond ]
+  %cmp6 = icmp slt i32 %y.0, 5
+  br i1 %cmp6, label %for.body7, label %for.inc10
+
+; CHECK: for.body7:
+; CHECK-NEXT: phi
+; CHECK-NEXT: store i32 0
+; CHECK-NEXT: store i32 1
+
+for.body7:                                        ; preds = %for.cond4
+  store i32 0, i32* @g_3, align 4
+  store i32 1, i32* %g.0, align 4
+  %inc = add nsw i32 %y.0, 1
+  br label %for.cond4
+
+for.inc10:                                        ; preds = %for.cond4
+  %inc12 = add nsw i32 %x.0, 1
+  br label %for.cond
+
+for.end13:                                        ; preds = %for.cond
+  %tmp14 = load i32, i32* @g_3, align 4
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %tmp14) nounwind
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+

Added: llvm/trunk/test/Transforms/LICM/2011-07-06-Alignment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2011-07-06-Alignment.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2011-07-06-Alignment.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2011-07-06-Alignment.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+
+@A = common global [1024 x float] zeroinitializer, align 4  ; 4-byte aligned; @main stores a <4 x float> starting at element 3
+
+define i32 @main() nounwind {
+entry:
+  br label %for.cond
+
+for.cond:
+  %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr [1024 x float], [1024 x float]* @A, i64 0, i64 3
+  %vecidx = bitcast float* %arrayidx to <4 x float>*
+  store <4 x float> zeroinitializer, <4 x float>* %vecidx, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp ne i64 %indvar, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:
+  br label %for.cond
+
+for.end:
+  ret i32 0
+}
+
+;CHECK: store <4 x float> {{.*}} align 4
+

Added: llvm/trunk/test/Transforms/LICM/2014-09-10-doFinalizationAssert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/2014-09-10-doFinalizationAssert.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/2014-09-10-doFinalizationAssert.ll (added)
+++ llvm/trunk/test/Transforms/LICM/2014-09-10-doFinalizationAssert.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s -scalar-evolution -licm -loop-unroll -disable-output
+; Test triggered an assertion in doFinalization() because loop unroll was deleting
+; the inner loop which caused the loop to not get removed from the
+; LoopToAliasSetMap.
+; Test case taken from test/Transforms/LoopUnroll/unloop.ll.
+
+declare i1 @check() nounwind
+define void @skiplevelexit() nounwind {
+entry:
+  br label %outer
+
+outer:
+  br label %inner
+
+inner:
+  %iv = phi i32 [ 0, %outer ], [ %inc, %tail ]
+  %inc = add i32 %iv, 1
+  call zeroext i1 @check()
+  br i1 true, label %outer.backedge, label %tail
+
+tail:
+  br i1 false, label %inner, label %exit
+
+outer.backedge:
+  br label %outer
+
+exit:
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/AliasSetMemSet.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/AliasSetMemSet.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/AliasSetMemSet.ll (added)
+++ llvm/trunk/test/Transforms/LICM/AliasSetMemSet.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,51 @@
+; RUN: opt < %s -loop-deletion -licm -loop-idiom -disable-output
+; Check no assertion when loop-idiom deletes the MemSet already analyzed by licm
+define void @set_array() {
+  br i1 false, label %bb3.preheader.lr.ph, label %bb9
+
+bb3.preheader.lr.ph:                              ; preds = %0
+  br label %bb3.preheader
+
+bb4:                                              ; preds = %bb4.lr.ph, %bb7
+  %j.3.06 = phi i8 [ %j.3.17, %bb4.lr.ph ], [ %_tmp13, %bb7 ]
+  br label %bb6
+
+bb6:                                              ; preds = %bb4, %bb6
+  %k.4.04 = phi i8 [ 0, %bb4 ], [ %_tmp9, %bb6 ]
+  %_tmp31 = sext i8 %j.3.06 to i64
+  %_tmp4 = mul i64 %_tmp31, 10
+  %_tmp5 = getelementptr i8, i8* undef, i64 %_tmp4
+  %_tmp7 = getelementptr i8, i8* %_tmp5, i8 %k.4.04
+  store i8 42, i8* %_tmp7
+  %_tmp9 = add i8 %k.4.04, 1
+  %_tmp11 = icmp slt i8 %_tmp9, 10
+  br i1 %_tmp11, label %bb6, label %bb7
+
+bb7:                                              ; preds = %bb6
+  %_tmp13 = add i8 %j.3.06, 1
+  %_tmp15 = icmp slt i8 %_tmp13, 2
+  br i1 %_tmp15, label %bb4, label %bb3.bb1.loopexit_crit_edge
+
+bb3.bb1.loopexit_crit_edge:                       ; preds = %bb7
+  %split = phi i8 [ %_tmp13, %bb7 ]
+  br label %bb1.loopexit
+
+bb1.loopexit:                                     ; preds = %bb3.bb1.loopexit_crit_edge, %bb3.preheader
+  %j.3.0.lcssa = phi i8 [ %split, %bb3.bb1.loopexit_crit_edge ], [ %j.3.17, %bb3.preheader ]
+  br i1 false, label %bb3.preheader, label %bb1.bb9_crit_edge
+
+bb3.preheader:                                    ; preds = %bb3.preheader.lr.ph, %bb1.loopexit
+  %j.3.17 = phi i8 [ undef, %bb3.preheader.lr.ph ], [ %j.3.0.lcssa, %bb1.loopexit ]
+  %_tmp155 = icmp slt i8 %j.3.17, 2
+  br i1 %_tmp155, label %bb4.lr.ph, label %bb1.loopexit
+
+bb4.lr.ph:                                        ; preds = %bb3.preheader
+  br label %bb4
+
+bb1.bb9_crit_edge:                                ; preds = %bb1.loopexit
+  br label %bb9
+
+bb9:                                              ; preds = %bb1.bb9_crit_edge, %0
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/PR19798.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/PR19798.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/PR19798.ll (added)
+++ llvm/trunk/test/Transforms/LICM/PR19798.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+
+define void @f() {
+; CHECK-LABEL: @f(
+entry:
+  br label %bb0
+
+bb0:
+  %tobool7 = icmp eq i1 undef, undef
+  br label %bb1
+
+bb1:
+  br i1 undef, label %bb0, label %bb0
+
+unreachable:
+; CHECK-LABEL: unreachable:
+; CHECK:   br i1 undef, label %unreachable, label %unreachable
+  br i1 %tobool7, label %unreachable, label %unreachable
+
+bb3:
+  unreachable
+}

Added: llvm/trunk/test/Transforms/LICM/PR21582.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/PR21582.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/PR21582.ll (added)
+++ llvm/trunk/test/Transforms/LICM/PR21582.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+@b = external global i32, align 4
+@fn3.i = external global i32, align 4
+
+declare i32 @g() nounwind
+
+define i32 @f() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.end, %entry
+; CHECK-LABEL: for.cond:
+; CHECK: store i32 0, i32* @b
+  store i32 0, i32* @b, align 4
+  br i1 true, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %for.cond
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %g.15 = phi i32 [ undef, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx2 = getelementptr inbounds i32, i32* @fn3.i, i64 0
+  %0 = load i32, i32* %arrayidx2, align 4
+  %call = call i32 @g()
+  br i1 false, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.cond
+  %whatever = phi i32 [ %call, %for.end.loopexit ], [ undef, %for.cond ]
+  br i1 false, label %for.cond, label %if.then
+
+if.then:                                          ; preds = %for.end
+; CHECK-LABEL: if.then:
+; CHECK: phi i32 [ {{.*}}, %for.end ]
+; CHECK-NOT: store i32 0, i32* @b
+; CHECK: ret i32
+  ret i32 %whatever
+}

Added: llvm/trunk/test/Transforms/LICM/PR24013.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/PR24013.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/PR24013.ll (added)
+++ llvm/trunk/test/Transforms/LICM/PR24013.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+
+define void @f(i1 zeroext %p1) {
+; CHECK-LABEL: @f(
+entry:
+  br label %lbl
+
+lbl.loopexit:                                     ; No predecessors!
+  br label %lbl
+
+lbl:                                              ; preds = %lbl.loopexit, %entry
+  %phi = phi i32 [ %conv, %lbl.loopexit ], [ undef, %entry ]
+; CHECK: phi i32 [ undef, {{.*}} ], [ undef
+  br label %if.then.5
+
+if.then.5:                                        ; preds = %if.then.5, %lbl
+  %conv = zext i1 undef to i32
+  br label %if.then.5
+}

Added: llvm/trunk/test/Transforms/LICM/Preserve-LCSSA.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/Preserve-LCSSA.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/Preserve-LCSSA.ll (added)
+++ llvm/trunk/test/Transforms/LICM/Preserve-LCSSA.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt < %s -loop-rotate -licm -loop-unswitch -disable-output -verify-loop-info -verify-dom-info
+
+define i32 @stringSearch_Clib(i32 %count) {
+entry:
+	br i1 false, label %bb36, label %bb44
+
+bb4:		; preds = %bb36
+	br i1 false, label %cond_next, label %cond_true
+
+cond_true:		; preds = %bb4
+	ret i32 0
+
+cond_next:		; preds = %bb4
+	ret i32 0
+
+bb36:		; preds = %bb41, %entry
+	br i1 false, label %bb4, label %bb41
+
+bb41:		; preds = %bb36
+	%ttmp2 = icmp slt i32 0, %count		; <i1> [#uses=1]
+	br i1 %ttmp2, label %bb36, label %bb44
+
+bb44:		; preds = %bb41, %entry
+	ret i32 0
+}

Added: llvm/trunk/test/Transforms/LICM/alias-set-tracker-loss.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/alias-set-tracker-loss.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/alias-set-tracker-loss.ll (added)
+++ llvm/trunk/test/Transforms/LICM/alias-set-tracker-loss.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt -S -licm -loop-unroll < %s
+;
+; This test contains a carefully rotated set of three nested loops. The middle
+; loop can be unrolled leaving one copy of the inner loop inside the outer
+; loop. Because of how LICM works, when this middle loop is unrolled and
+; removed, its alias set tracker is destroyed and no longer available when LICM
+; runs on the outer loop.
+
+define void @f() {
+entry:
+  br label %l1
+
+l2.l1.loopexit_crit_edge:
+  br label %l1.loopexit
+
+l1.loopexit:
+  br label %l1.backedge
+
+l1:
+  br i1 undef, label %l1.backedge, label %l2.preheader
+
+l1.backedge:
+  br label %l1
+
+l2.preheader:
+  br i1 true, label %l1.loopexit, label %l3.preheader.lr.ph
+
+l3.preheader.lr.ph:
+  br label %l3.preheader
+
+l2.loopexit:
+  br i1 true, label %l2.l1.loopexit_crit_edge, label %l3.preheader
+
+l3.preheader:
+  br label %l3
+
+l3:
+  br i1 true, label %l3, label %l2.loopexit
+}

Added: llvm/trunk/test/Transforms/LICM/argmemonly-call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/argmemonly-call.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/argmemonly-call.ll (added)
+++ llvm/trunk/test/Transforms/LICM/argmemonly-call.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,159 @@
+; RUN: opt -S -basicaa -licm -licm-n2-threshold=0 %s | FileCheck %s
+; RUN: opt -licm -basicaa -licm-n2-threshold=200 < %s -S | FileCheck %s --check-prefix=ALIAS-N2
+; RUN: opt -aa-pipeline=basic-aa -licm-n2-threshold=0 -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -licm-n2-threshold=200 -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s --check-prefix=ALIAS-N2
+; RUN: opt -S -basicaa -licm -licm-n2-threshold=0 -enable-mssa-loop-dependency=true -verify-memoryssa %s | FileCheck %s --check-prefix=ALIAS-N2
+
+declare i32 @foo() readonly argmemonly nounwind
+declare i32 @foo2() readonly nounwind
+declare i32 @bar(i32* %loc2) readonly argmemonly nounwind
+
+define void @test(i32* %loc) {
+; CHECK-LABEL: @test
+; CHECK: @foo
+; CHECK-LABEL: loop:
+; ALIAS-N2-LABEL: @test
+; ALIAS-N2: @foo
+; ALIAS-N2-LABEL: loop:
+  br label %loop
+
+loop:
+  %res = call i32 @foo()
+  store i32 %res, i32* %loc
+  br label %loop
+}
+
+; Negative test: show argmemonly is required
+define void @test_neg(i32* %loc) {
+; CHECK-LABEL: @test_neg
+; CHECK-LABEL: loop:
+; CHECK: @foo
+; ALIAS-N2-LABEL: @test_neg
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2: @foo
+  br label %loop
+
+loop:
+  %res = call i32 @foo2()
+  store i32 %res, i32* %loc
+  br label %loop
+}
+
+define void @test2(i32* noalias %loc, i32* noalias %loc2) {
+; CHECK-LABEL: @test2
+; CHECK: @bar
+; CHECK-LABEL: loop:
+; ALIAS-N2-LABEL: @test2
+; ALIAS-N2: @bar
+; ALIAS-N2-LABEL: loop:
+  br label %loop
+
+loop:
+  %res = call i32 @bar(i32* %loc2)
+  store i32 %res, i32* %loc
+  br label %loop
+}
+
+; Negative test: the store to %gep might clobber memory @bar reads via %loc
+define void @test3(i32* %loc) {
+; CHECK-LABEL: @test3
+; CHECK-LABEL: loop:
+; CHECK: @bar
+; ALIAS-N2-LABEL: @test3
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2: @bar
+  br label %loop
+
+loop:
+  %res = call i32 @bar(i32* %loc)
+  %gep = getelementptr i32, i32 *%loc, i64 1000000
+  store i32 %res, i32* %gep
+  br label %loop
+}
+
+
+; Negative test: %loc might alias %loc2
+define void @test4(i32* %loc, i32* %loc2) {
+; CHECK-LABEL: @test4
+; CHECK-LABEL: loop:
+; CHECK: @bar
+; ALIAS-N2-LABEL: @test4
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2: @bar
+  br label %loop
+
+loop:
+  %res = call i32 @bar(i32* %loc2)
+  store i32 %res, i32* %loc
+  br label %loop
+}
+
+declare i32 @foo_new(i32*) readonly
+; With the default AST mechanism used by LICM for alias analysis,
+; we clump foo_new with bar.
+; With the N2 Alias analysis diagnostic tool, we are able to hoist the
+; argmemonly bar call out of the loop.
+; Using MemorySSA we can also hoist bar.
+
+define void @test5(i32* %loc2, i32* noalias %loc) {
+; ALIAS-N2-LABEL: @test5
+; ALIAS-N2: @bar
+; ALIAS-N2-LABEL: loop:
+
+; CHECK-LABEL: @test5
+; CHECK-LABEL: loop:
+; CHECK:  @bar
+  br label %loop
+
+loop:
+  %res1 = call i32 @bar(i32* %loc2)
+  %res = call i32 @foo_new(i32* %loc2)
+  store volatile i32 %res1, i32* %loc
+  br label %loop
+}
+
+
+; memcpy doesn't write to its source argument, so loads from that location
+; can still be hoisted
+define void @test6(i32* noalias %loc, i32* noalias %loc2) {
+; CHECK-LABEL: @test6
+; CHECK: %val = load i32, i32* %loc2
+; CHECK-LABEL: loop:
+; CHECK: @llvm.memcpy
+; ALIAS-N2-LABEL: @test6
+; ALIAS-N2: %val = load i32, i32* %loc2
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2: @llvm.memcpy
+  br label %loop
+
+loop:
+  %val = load i32, i32* %loc2
+  store i32 %val, i32* %loc
+  %dest = bitcast i32* %loc to i8*
+  %src = bitcast i32* %loc2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 8, i1 false)
+  br label %loop
+}
+
+define void @test7(i32* noalias %loc, i32* noalias %loc2) {
+; CHECK-LABEL: @test7
+; CHECK: %val = load i32, i32* %loc2
+; CHECK-LABEL: loop:
+; CHECK: @custom_memcpy
+; ALIAS-N2-LABEL: @test7
+; ALIAS-N2: %val = load i32, i32* %loc2
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2: @custom_memcpy
+  br label %loop
+
+loop:
+  %val = load i32, i32* %loc2
+  store i32 %val, i32* %loc
+  %dest = bitcast i32* %loc to i8*
+  %src = bitcast i32* %loc2 to i8*
+  call void @custom_memcpy(i8* %dest, i8* %src)
+  br label %loop
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
+declare void @custom_memcpy(i8* nocapture writeonly, i8* nocapture readonly) argmemonly nounwind

Added: llvm/trunk/test/Transforms/LICM/assume.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/assume.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/assume.ll (added)
+++ llvm/trunk/test/Transforms/LICM/assume.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,99 @@
+; RUN: opt -licm -basicaa < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+
+define void @f_0(i1 %p) nounwind ssp {
+; CHECK-LABEL: @f_0(
+entry:
+  br label %for.body
+
+for.body:
+  br i1 undef, label %if.then, label %for.cond.backedge
+
+for.cond.backedge:
+  br i1 undef, label %for.end104, label %for.body
+
+if.then:
+  br i1 undef, label %if.then27, label %if.end.if.end.split_crit_edge.critedge
+
+if.then27:
+; CHECK: tail call void @llvm.assume
+  tail call void @llvm.assume(i1 %p)
+  br label %for.body61.us
+
+if.end.if.end.split_crit_edge.critedge:
+  br label %for.body61
+
+for.body61.us:
+  br i1 undef, label %for.cond.backedge, label %for.body61.us
+
+for.body61:
+  br i1 undef, label %for.cond.backedge, label %for.body61
+
+for.end104:
+  ret void
+}
+
+define void @f_1(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @f_1(
+; CHECK-LABEL: entry:
+; CHECK: call void @llvm.assume(i1 %cond)
+; CHECK: %val = load i32, i32* %ptr
+; CHECK-LABEL: loop:
+
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  call void @llvm.assume(i1 %cond)
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; Can't hoist because the call may throw and the assume
+; may never execute.
+define void @f_2(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @f_2(
+; CHECK-LABEL: entry:
+; CHECK-LABEL: loop:
+; CHECK: call void @llvm.assume(i1 %cond)
+; CHECK: %val = load i32, i32* %ptr
+
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  call void @maythrow()
+  call void @llvm.assume(i1 %cond)
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; Note: resulting loop could be peeled and then hoisted, but
+; by default assume is captured in phi cycle.
+define void @f_3(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @f_3(
+; CHECK-LABEL: entry:
+; CHECK: %val = load i32, i32* %ptr
+; CHECK-LABEL: loop:
+; CHECK: call void @llvm.assume(i1 %x.cmp)
+
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  %x.cmp = phi i1 [%cond, %entry], [%cond.next, %loop]
+  call void @llvm.assume(i1 %x.cmp)
+  %val = load i32, i32* %ptr
+  %cond.next = icmp eq i32 %val, 5
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+
+declare void @maythrow()
+declare void @llvm.assume(i1)

Added: llvm/trunk/test/Transforms/LICM/atomics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/atomics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/atomics.ll (added)
+++ llvm/trunk/test/Transforms/LICM/atomics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,223 @@
+; RUN: opt < %s -S -basicaa -licm | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+
+; Check that we can hoist unordered loads
+define i32 @test1(i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ %inc, %loop ], [ 0, %entry ]
+  %val = load atomic i32, i32* %y unordered, align 4
+  %inc = add nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, %val
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %val
+; CHECK-LABEL: define i32 @test1(
+; CHECK: load atomic
+; CHECK-NEXT: br label %loop
+}
+
+; Check that we don't sink/hoist monotonic loads
+; (Strictly speaking, it's not forbidden, but it's supposed to be possible to
+; use monotonic for spinlock-like constructs.)
+define i32 @test2(i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %val = load atomic i32, i32* %y monotonic, align 4
+  %exitcond = icmp ne i32 %val, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %val
+; CHECK-LABEL: define i32 @test2(
+; CHECK: load atomic
+; CHECK-NEXT: %exitcond = icmp ne
+; CHECK-NEXT: br i1 %exitcond, label %end, label %loop
+}
+
+; Check that we hoist unordered around monotonic.
+; (The noalias shouldn't be necessary in theory, but LICM isn't quite that
+; smart yet.)
+define i32 @test3(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  %valb = load atomic i32, i32* %x unordered, align 4
+  %exitcond = icmp ne i32 %vala, %valb
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test3(
+; CHECK: load atomic i32, i32* %x unordered
+; CHECK-NEXT: br label %loop
+}
+
+; We can sink an unordered store
+define i32 @test4(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test4(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NOT: store
+; CHECK-LABEL: end:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %vala
+; CHECK:   store atomic i32 %[[LCSSAPHI]], i32* %x unordered, align 4
+}
+
+; We currently don't handle ordered atomics.
+define i32 @test5(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x release, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test5(
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store atomic
+}
+
+; We currently don't touch volatiles
+define i32 @test6(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store volatile i32 %vala, i32* %x, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test6(
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store volatile
+}
+
+; We currently don't touch volatiles
+define i32 @test6b(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic volatile i32 %vala, i32* %x unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test6b(
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store atomic volatile
+}
+
+; Mixing unordered atomics and normal loads/stores is
+; currently unimplemented
+define i32 @test7(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  store i32 5, i32* %x
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test7(
+; CHECK: store i32 5, i32* %x
+; CHECK-NEXT: load atomic i32, i32* %y
+; CHECK-NEXT: store atomic i32
+}
+
+; Three provably noalias locations - we can sink normal and unordered, but
+;  not monotonic
+define i32 @test7b(i32* nocapture noalias %x, i32* nocapture %y, i32* noalias nocapture %z) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  store i32 5, i32* %x
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %z unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test7b(
+; CHECK-LABEL: entry:
+; CHECK: store i32 5, i32* %x
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-LABEL: end:
+; CHECK: store atomic i32 %{{.+}}, i32* %z unordered, align 4
+}
+
+; Like test4, but a release fence follows the unordered store; the checks expect the store to stay in the loop, directly before the fence.
+define i32 @test8(i32* nocapture noalias %x, i32* nocapture %y) {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x unordered, align 4
+  fence release
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test8(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store atomic
+; CHECK-NEXT: fence
+}
+
+; Exact semantics of monotonic accesses are a bit vague in the C++ spec,
+; for the moment, be conservative and don't touch them.
+define i32 @test9(i32* nocapture noalias %x, i32* nocapture %y) {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x monotonic, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test9(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT:   store atomic i32 %vala, i32* %x monotonic, align 4
+}

Added: llvm/trunk/test/Transforms/LICM/basictest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/basictest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/basictest.ll (added)
+++ llvm/trunk/test/Transforms/LICM/basictest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -licm | llvm-dis
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s | llvm-dis
+
+define void @testfunc(i32 %i) {
+; <label>:0
+	br label %Loop
+Loop:		; preds = %Loop, %0
+	%j = phi i32 [ 0, %0 ], [ %Next, %Loop ]		; <i32> [#uses=1]
+	%i2 = mul i32 %i, 17		; <i32> [#uses=1]
+	%Next = add i32 %j, %i2		; <i32> [#uses=2]
+	%cond = icmp eq i32 %Next, 0		; <i1> [#uses=1]
+	br i1 %cond, label %Out, label %Loop
+Out:		; preds = %Loop
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/bisect-state.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/bisect-state.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/bisect-state.ll (added)
+++ llvm/trunk/test/Transforms/LICM/bisect-state.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; Make sure we don't crash in LICM.
+; RUN: opt %s -licm -opt-bisect-limit=1
+
+define void @patatino() {
+for.cond1:
+  br label %for.body
+for.body:
+  br label %for.cond5
+for.cond5:
+  br i1 true, label %if.end, label %for.end
+if.end:
+  br label %for.cond5
+for.end:
+  br label %for.body
+}

Added: llvm/trunk/test/Transforms/LICM/call-hoisting.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/call-hoisting.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/call-hoisting.ll (added)
+++ llvm/trunk/test/Transforms/LICM/call-hoisting.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,259 @@
+; RUN: opt -S -basicaa -licm %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+
+declare i32 @load(i32* %p) argmemonly readonly nounwind
+
+define void @test_load(i32* noalias %loc, i32* noalias %sink) {
+; CHECK-LABEL: @test_load
+; CHECK-LABEL: entry:
+; CHECK: call i32 @load
+; CHECK-LABEL: loop:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  %ret = call i32 @load(i32* %loc)
+  store volatile i32 %ret, i32* %sink
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+declare void @store(i32 %val, i32* %p) argmemonly writeonly nounwind
+
+define void @test(i32* %loc) {
+; CHECK-LABEL: @test
+; CHECK-LABEL: loop:
+; CHECK: call void @store
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @store(i32 0, i32* %loc)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_multiexit(i32* %loc, i1 %earlycnd) {
+; CHECK-LABEL: @test_multiexit
+; CHECK-LABEL: loop:
+; CHECK: call void @store
+; CHECK-LABEL: backedge:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+  call void @store(i32 0, i32* %loc)
+  %iv.next = add i32 %iv, 1
+  br i1 %earlycnd, label %exit1, label %backedge
+  
+backedge:
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit2
+
+exit1:
+  ret void
+exit2:
+  ret void
+}
+
+define void @neg_lv_value(i32* %loc) {
+; CHECK-LABEL: @neg_lv_value
+; CHECK-LABEL: loop:
+; CHECK: call void @store
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @store(i32 %iv, i32* %loc)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_lv_addr(i32* %loc) {
+; CHECK-LABEL: @neg_lv_addr
+; CHECK-LABEL: loop:
+; CHECK: call void @store
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  %p = getelementptr i32, i32* %loc, i32 %iv
+  call void @store(i32 0, i32* %p)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_mod(i32* %loc) {
+; CHECK-LABEL: @neg_mod
+; CHECK-LABEL: loop:
+; CHECK: call void @store
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @store(i32 0, i32* %loc)
+  store i32 %iv, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_ref(i32* %loc) {
+; CHECK-LABEL: @neg_ref
+; CHECK-LABEL: loop:
+; CHECK: call void @store
+; CHECK-LABEL: exit1:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+  call void @store(i32 0, i32* %loc)
+  %v = load i32, i32* %loc
+  %earlycnd = icmp eq i32 %v, 198
+  br i1 %earlycnd, label %exit1, label %backedge
+  
+backedge:
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit2
+
+exit1:
+  ret void
+exit2:
+  ret void
+}
+
+declare void @modref()
+
+define void @neg_modref(i32* %loc) {
+; CHECK-LABEL: @neg_modref
+; CHECK-LABEL: loop:
+; CHECK: call void @store
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @store(i32 0, i32* %loc)
+  call void @modref()
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_fence(i32* %loc) {
+; CHECK-LABEL: @neg_fence
+; CHECK-LABEL: loop:
+; CHECK: call void @store
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @store(i32 0, i32* %loc)
+  fence seq_cst
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+declare void @not_nounwind(i32 %v, i32* %p) writeonly argmemonly
+declare void @not_argmemonly(i32 %v, i32* %p) writeonly nounwind
+declare void @not_writeonly(i32 %v, i32* %p) argmemonly nounwind
+
+define void @neg_not_nounwind(i32* %loc) {
+; CHECK-LABEL: @neg_not_nounwind
+; CHECK-LABEL: loop:
+; CHECK: call void @not_nounwind
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @not_nounwind(i32 0, i32* %loc)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_not_argmemonly(i32* %loc) {
+; CHECK-LABEL: @neg_not_argmemonly
+; CHECK-LABEL: loop:
+; CHECK: call void @not_argmemonly
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @not_argmemonly(i32 0, i32* %loc)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_not_writeonly(i32* %loc) {
+; CHECK-LABEL: @neg_not_writeonly
+; CHECK-LABEL: loop:
+; CHECK: call void @not_writeonly
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @not_writeonly(i32 0, i32* %loc)
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/constexpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/constexpr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/constexpr.ll (added)
+++ llvm/trunk/test/Transforms/LICM/constexpr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; RUN: opt < %s -S -basicaa -licm | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+; This fixes PR22460
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+@in = internal unnamed_addr global i32* null, align 8
+@out = internal unnamed_addr global i32* null, align 8
+
+; CHECK-LABEL: @bar
+; CHECK: entry:
+; CHECK: load i64, i64* bitcast (i32** @in to i64*)
+; CHECK: do.body:
+; CHECK-NOT: load
+
+define i64 @bar(i32 %N) {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %l2, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %l2 ]
+  %total = phi i64 [ 0, %entry ], [ %next, %l2 ]
+  %c = icmp eq i32 %N, 6
+  br i1 %c, label %l1, label %do.body.l2_crit_edge
+
+do.body.l2_crit_edge:                             ; preds = %do.body
+  %inval.pre = load i32*, i32** @in, align 8
+  br label %l2
+
+l1:                                               ; preds = %do.body
+  %v1 = load i64, i64* bitcast (i32** @in to i64*), align 8
+  store i64 %v1, i64* bitcast (i32** @out to i64*), align 8
+  %0 = inttoptr i64 %v1 to i32*
+  br label %l2
+
+l2:                                               ; preds = %do.body.l2_crit_edge, %l1
+  %inval = phi i32* [ %inval.pre, %do.body.l2_crit_edge ], [ %0, %l1 ]
+  %int = ptrtoint i32* %inval to i64
+  %next = add i64 %total, %int
+  %inc = add nsw i32 %i.0, 1
+  %cmp = icmp slt i32 %inc, %N
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %l2
+  ret i64 %total
+}

Added: llvm/trunk/test/Transforms/LICM/crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/crash.ll (added)
+++ llvm/trunk/test/Transforms/LICM/crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,75 @@
+; RUN: opt -licm -disable-output < %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -disable-output < %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+
+; PR8068
+@g_12 = external global i8, align 1
+define void @test1() nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.cond, %entry
+  store i8 0, i8* @g_12, align 1
+  %tmp6 = load i8, i8* @g_12, align 1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body
+  store i8 %tmp6, i8* @g_12, align 1
+  br i1 false, label %for.cond.for.end10_crit_edge, label %for.body
+
+for.cond.for.end10_crit_edge:                     ; preds = %for.cond
+  br label %for.end10
+
+for.end10:                                        ; preds = %for.cond.for.end10_crit_edge
+  ret void
+}
+
+; PR8067
+@g_8 = external global i32, align 4
+
+define void @test2() noreturn nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %tmp7 = load i32, i32* @g_8, align 4
+  store i32* @g_8, i32** undef, align 16
+  store i32 undef, i32* @g_8, align 4
+  br label %for.body
+}
+
+; PR8102
+define void @test3() {
+entry:
+  %__first = alloca { i32* }
+  br i1 undef, label %for.cond, label %for.end
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %tmp1 = getelementptr { i32*}, { i32*}* %__first, i32 0, i32 0
+  %tmp2 = load i32*, i32** %tmp1, align 4
+  %call = tail call i32* @test3helper(i32* %tmp2)
+  %tmp3 = getelementptr { i32*}, { i32*}* %__first, i32 0, i32 0
+  store i32* %call, i32** %tmp3, align 4
+  br i1 false, label %for.cond, label %for.end
+
+for.end:                                          ; preds = %for.cond, %entry
+  ret void
+}
+
+declare i32* @test3helper(i32*)
+
+
+; PR8602
+@g_47 = external global i32, align 4
+
+define void @test4() noreturn nounwind {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  store volatile i32* @g_47, i32** undef, align 8
+  store i32 undef, i32* @g_47, align 4
+  br label %1
+}

Added: llvm/trunk/test/Transforms/LICM/debug-value.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/debug-value.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/debug-value.ll (added)
+++ llvm/trunk/test/Transforms/LICM/debug-value.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; RUN: opt -licm -basicaa < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+
+define void @dgefa() nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.cond.backedge, %entry
+  br i1 undef, label %if.then, label %for.cond.backedge, !dbg !11
+
+for.cond.backedge:                                ; preds = %for.body61, %for.body61.us, %for.body
+  br i1 undef, label %for.end104, label %for.body, !dbg !15
+
+if.then:                                          ; preds = %for.body
+  br i1 undef, label %if.then27, label %if.end.if.end.split_crit_edge.critedge, !dbg !16
+
+if.then27:                                        ; preds = %if.then
+; CHECK: tail call void @llvm.dbg.value
+  tail call void @llvm.dbg.value(metadata double undef, metadata !19, metadata !DIExpression()), !dbg !21
+  br label %for.body61.us
+
+if.end.if.end.split_crit_edge.critedge:           ; preds = %if.then
+  br label %for.body61
+
+for.body61.us:                                    ; preds = %for.body61.us, %if.then27
+  br i1 undef, label %for.cond.backedge, label %for.body61.us, !dbg !23
+
+for.body61:                                       ; preds = %for.body61, %if.end.if.end.split_crit_edge.critedge
+  br i1 undef, label %for.cond.backedge, label %for.body61, !dbg !23
+
+for.end104:                                       ; preds = %for.cond.backedge
+  ret void, !dbg !24
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
+
+!llvm.module.flags = !{!26}
+!llvm.dbg.cu = !{!2}
+
+!0 = distinct !DISubprogram(name: "idamax", line: 112, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !2, file: !25, scope: !1, type: !3)
+!1 = !DIFile(filename: "/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/Benchmarks/CoyoteBench/lpbench.c", directory: "/private/tmp")
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 127169)", isOptimized: true, emissionKind: FullDebug, file: !25)
+!3 = !DISubroutineType(types: !4)
+!4 = !{!5}
+!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!6 = distinct !DISubprogram(name: "dscal", line: 206, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !2, file: !25, scope: !1, type: !7)
+!7 = !DISubroutineType(types: !{null})
+!9 = distinct !DISubprogram(name: "daxpy", line: 230, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !2, file: !25, scope: !1, type: !7)
+!10 = distinct !DISubprogram(name: "dgefa", line: 267, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !2, file: !25, scope: !1, type: !7)
+!11 = !DILocation(line: 281, column: 9, scope: !12)
+!12 = distinct !DILexicalBlock(line: 272, column: 5, file: !25, scope: !13)
+!13 = distinct !DILexicalBlock(line: 271, column: 5, file: !25, scope: !14)
+!14 = distinct !DILexicalBlock(line: 267, column: 1, file: !25, scope: !10)
+!15 = !DILocation(line: 271, column: 5, scope: !14)
+!16 = !DILocation(line: 284, column: 10, scope: !17)
+!17 = distinct !DILexicalBlock(line: 282, column: 9, file: !25, scope: !12)
+!18 = !{double undef}
+!19 = !DILocalVariable(name: "temp", line: 268, scope: !14, file: !1, type: !20)
+!20 = !DIBasicType(tag: DW_TAG_base_type, name: "double", size: 64, align: 64, encoding: DW_ATE_float)
+!21 = !DILocation(line: 286, column: 14, scope: !22)
+!22 = distinct !DILexicalBlock(line: 285, column: 13, file: !25, scope: !17)
+!23 = !DILocation(line: 296, column: 13, scope: !17)
+!24 = !DILocation(line: 313, column: 1, scope: !14)
+!25 = !DIFile(filename: "/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/Benchmarks/CoyoteBench/lpbench.c", directory: "/private/tmp")
+!26 = !{i32 1, !"Debug Info Version", i32 3}

Added: llvm/trunk/test/Transforms/LICM/dropped-tbaa.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/dropped-tbaa.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/dropped-tbaa.ll (added)
+++ llvm/trunk/test/Transforms/LICM/dropped-tbaa.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -scoped-noalias -tbaa -licm -S | FileCheck %s
+
+; This test case is generated from the following C code with -fstrict-aliasing,
+; and after passing through -inline -mem2reg -loop-rotate -instcombine
+; void add(double *restrict data, int *restrict addend) {
+;    *data += *addend;
+; }
+;
+; void foo(double *data, int *addend) {
+;    for (int i = 0; i < 1000; ++i) {
+;        *data += *addend;
+;        add(data, addend);
+;    }
+; }
+; We want to make sure the load of addend gets hoisted, independent of the second
+; load having different noalias metadata.
+
+define void @foo(double* %data, i32* %addend) #0 {
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ADDEND:%.*]], align 4, !tbaa !1
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ADDEND]], align 4, !tbaa !1, !alias.scope !5, !noalias !8
+; CHECK-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[TMP2]] to double
+entry:
+  %i = alloca i32, align 4
+  %0 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #2
+  store i32 0, i32* %i, align 4, !tbaa !1
+  br i1 true, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  %1 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %1) #2
+  br label %for.end
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc
+  %2 = load i32, i32* %addend, align 4, !tbaa !1
+  %conv = sitofp i32 %2 to double
+  %3 = load i32, i32* %i, align 4, !tbaa !1
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds double, double* %data, i64 %idxprom
+  %4 = load double, double* %arrayidx, align 8, !tbaa !5
+  %add = fadd double %4, %conv
+  store double %add, double* %arrayidx, align 8, !tbaa !5
+  %idxprom1 = sext i32 %3 to i64
+  %arrayidx2 = getelementptr inbounds double, double* %data, i64 %idxprom1
+  %5 = load i32, i32* %addend, align 4, !tbaa !1, !alias.scope !7, !noalias !10
+  %conv.i = sitofp i32 %5 to double
+  %6 = load double, double* %arrayidx2, align 8, !tbaa !5, !alias.scope !10, !noalias !7
+  %add.i = fadd double %6, %conv.i
+  store double %add.i, double* %arrayidx2, align 8, !tbaa !5, !alias.scope !10, !noalias !7
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4, !tbaa !1
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4, !tbaa !1
+  %cmp = icmp slt i32 %inc, 1000
+  br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge
+
+for.end:                                          ; preds = %for.cond.cleanup
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
+
+attributes #0 = { argmemonly nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0  (llvm/trunk 299971)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"double", !3, i64 0}
+!7 = !{!8}
+!8 = distinct !{!8, !9, !"add: %addend"}
+!9 = distinct !{!9, !"add"}
+!10 = !{!11}
+!11 = distinct !{!11, !9, !"add: %data"}

Added: llvm/trunk/test/Transforms/LICM/explicit_guards.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/explicit_guards.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/explicit_guards.ll (added)
+++ llvm/trunk/test/Transforms/LICM/explicit_guards.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -make-guards-explicit -basicaa -licm < %s        | FileCheck %s
+; RUN: opt -S -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,make-guards-explicit,loop(licm)' < %s | FileCheck %s
+
+; Test interaction between explicit guards and LICM: make sure that we do not
+; hoist explicit conditions while we can hoist invariant loads in presence of
+; explicit guards.
+
+declare void @llvm.experimental.guard(i1,...)
+
+; Make sure that we do not hoist widenable_cond out of loop.
+define void @do_not_hoist_widenable_cond(i1 %cond, i32 %N, i32 %M) {
+; CHECK-LABEL: @do_not_hoist_widenable_cond(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ]
+; CHECK-NEXT:    [[GUARD_COND:%.*]] = icmp slt i32 [[IV]], [[N:%.*]]
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[GUARD_COND]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[GUARDED]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"() ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV]], [[M:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %guard_cond = icmp slt i32 %iv, %N
+  call void(i1, ...) @llvm.experimental.guard(i1 %guard_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv, %M
+  %iv.next = add i32 %iv, 1
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @hoist_invariant_load(i1 %cond, i32* %np, i32 %M) {
+; CHECK-LABEL: @hoist_invariant_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[N:%.*]] = load i32, i32* [[NP:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[GUARDED:%.*]] ]
+; CHECK-NEXT:    [[GUARD_COND:%.*]] = icmp slt i32 [[IV]], [[N]]
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[GUARD_COND]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[GUARDED]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"() ]
+; CHECK-NEXT:    ret void
+; CHECK:       guarded:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV]], [[M:%.*]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %N = load i32, i32* %np
+  %guard_cond = icmp slt i32 %iv, %N
+  call void(i1, ...) @llvm.experimental.guard(i1 %guard_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv, %M
+  %iv.next = add i32 %iv, 1
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/extra-copies.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/extra-copies.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/extra-copies.ll (added)
+++ llvm/trunk/test/Transforms/LICM/extra-copies.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s -licm -S | FileCheck %s
+; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+; PR19835
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @f(i32 %x) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %storemerge4 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = mul nsw i32 %x, %x
+  %add2 = add nsw i32 %mul, %x
+  %mul3 = add nsw i32 %add2, %mul
+  %inc = add nsw i32 %storemerge4, 1
+  %cmp = icmp slt i32 %inc, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %a9.0.lcssa = phi i32 [ %mul3, %for.body ]
+  ret i32 %a9.0.lcssa
+}
+
+; Test that there is exactly one copy of mul nsw i32 %x, %x in the exit block.
+; CHECK: define i32 @f(i32 [[X:%.*]])
+; CHECK: for.end:
+; CHECK-NOT: mul nsw i32 [[X]], [[X]]
+; CHECK: mul nsw i32 [[X]], [[X]]
+; CHECK-NOT: mul nsw i32 [[X]], [[X]]

Added: llvm/trunk/test/Transforms/LICM/fence.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/fence.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/fence.ll (added)
+++ llvm/trunk/test/Transforms/LICM/fence.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,120 @@
+; RUN: opt -licm -basicaa < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+
+define void @test1(i64 %n) {
+; CHECK-LABEL: @test1
+; CHECK: fence
+; CHECK-LABEL: loop:
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  fence release
+  %iv.next = add i64 %iv, 1
+  %test = icmp slt i64 %iv, %n
+  br i1 %test, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @test2(i64 %n) {
+; CHECK-LABEL: @test2
+; CHECK: fence
+; CHECK-LABEL: loop:
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  fence acquire
+  %iv.next = add i64 %iv, 1
+  %test = icmp slt i64 %iv, %n
+  br i1 %test, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @test3(i64 %n) {
+; CHECK-LABEL: @test3
+; CHECK: fence
+; CHECK-LABEL: loop:
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  fence acq_rel
+  %iv.next = add i64 %iv, 1
+  %test = icmp slt i64 %iv, %n
+  br i1 %test, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @test4(i64 %n) {
+; CHECK-LABEL: @test4
+; CHECK: fence
+; CHECK-LABEL: loop:
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  fence seq_cst
+  %iv.next = add i64 %iv, 1
+  %test = icmp slt i64 %iv, %n
+  br i1 %test, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @testneg1(i64 %n, i64* %p) {
+; CHECK-LABEL: @testneg1
+; CHECK-LABEL: loop:
+; CHECK: fence
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  store i64 %iv, i64* %p
+  fence release
+  %iv.next = add i64 %iv, 1
+  %test = icmp slt i64 %iv, %n
+  br i1 %test, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @testneg2(i64* %p) {
+; CHECK-LABEL: @testneg2
+; CHECK-LABEL: loop:
+; CHECK: fence
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  fence acquire
+  %n = load i64, i64* %p
+  %iv.next = add i64 %iv, 1
+  %test = icmp slt i64 %iv, %n
+  br i1 %test, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Note: While a false negative for LICM on its own, O3 does get this
+; case by combining the fences.
+define void @testfn1(i64 %n, i64* %p) {
+; CHECK-LABEL: @testfn1
+; CHECK-LABEL: loop:
+; CHECK: fence
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  fence release
+  fence release
+  %iv.next = add i64 %iv, 1
+  %test = icmp slt i64 %iv, %n
+  br i1 %test, label %loop, label %exit
+exit:
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/funclet.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/funclet.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/funclet.ll (added)
+++ llvm/trunk/test/Transforms/LICM/funclet.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,110 @@
+; RUN: opt < %s -licm -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc18.0.0"
+
+define void @test1(i32* %s, i1 %b) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %0 = call i32 @pure_computation()
+  br i1 %b, label %try.cont, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  invoke void @may_throw()
+          to label %while.cond unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %while.body
+  %.lcssa1 = phi i32 [ %0, %while.body ]
+  %cs = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %cp = catchpad within %cs [i8* null, i32 64, i8* null]
+  store i32 %.lcssa1, i32* %s
+  catchret from %cp to label %try.cont
+
+try.cont:                                         ; preds = %catch, %while.cond
+  ret void
+}
+
+; CHECK-LABEL: define void @test1(
+; CHECK: %[[CALL:.*]] = call i32 @pure_computation()
+; CHECK: phi i32 [ %[[CALL]]
+
+define void @test2(i32* %s, i1 %b) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %0 = call i32 @pure_computation()
+  br i1 %b, label %try.cont, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  invoke void @may_throw()
+          to label %while.cond unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %while.body
+  %.lcssa1 = phi i32 [ %0, %while.body ]
+  %cp = cleanuppad within none []
+  store i32 %.lcssa1, i32* %s
+  cleanupret from %cp unwind to caller
+
+try.cont:                                         ; preds = %while.cond
+  ret void
+}
+
+; CHECK-LABEL: define void @test2(
+; CHECK:      %[[CP:.*]] = cleanuppad within none []
+; CHECK-NEXT: %[[CALL:.*]] = call i32 @pure_computation() [ "funclet"(token %[[CP]]) ]
+; CHECK-NEXT: store i32 %[[CALL]], i32* %s
+; CHECK-NEXT: cleanupret from %[[CP]] unwind to caller
+
+define void @test3(i1 %a, i1 %b, i1 %c) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  %.frame = alloca i8, align 4
+  %.frame2 = alloca i8, align 4
+  %bc = bitcast i8* %.frame to i32*
+  %bc2 = bitcast i8* %.frame2 to i32*
+  br i1 %a, label %try.success.or.caught, label %forbody
+
+catch.object.Throwable:                           ; preds = %catch.dispatch
+  %cp = catchpad within %cs [i8* null, i32 64, i8* null]
+  unreachable
+
+try.success.or.caught:                            ; preds = %forcond.backedge, %entry
+  ret void
+
+postinvoke:                                       ; preds = %forbody
+  br i1 %b, label %else, label %forcond.backedge
+
+forcond.backedge:                                 ; preds = %else, %postinvoke
+  br i1 %c, label %try.success.or.caught, label %forbody
+
+catch.dispatch:                                   ; preds = %else, %forbody
+  %cs = catchswitch within none [label %catch.object.Throwable] unwind to caller
+
+forbody:                                          ; preds = %forcond.backedge, %entry
+  store i32 1, i32* %bc, align 4
+  store i32 2, i32* %bc2, align 4
+  invoke void @may_throw()
+          to label %postinvoke unwind label %catch.dispatch
+
+else:                                             ; preds = %postinvoke
+  invoke void @may_throw()
+          to label %forcond.backedge unwind label %catch.dispatch
+}
+
+; CHECK-LABEL: define void @test3(
+; CHECK-LABEL: forbody.preheader:
+; CHECK:      store i32 1, i32* %bc, align 4
+; CHECK:      store i32 2, i32* %bc2, align 4
+; CHECK:      catchswitch within none
+; CHECK-LABEL: forbody:
+
+declare void @may_throw()
+
+declare i32 @pure_computation() nounwind argmemonly readonly
+
+declare i32 @__CxxFrameHandler3(...)

Added: llvm/trunk/test/Transforms/LICM/guards.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/guards.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/guards.ll (added)
+++ llvm/trunk/test/Transforms/LICM/guards.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,540 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -licm -basicaa -ipt-expensive-asserts=true < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -ipt-expensive-asserts=true < %s -S | FileCheck %s
+
+; Hoist guard and load.
+define void @test1(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[COND:%.*]]) [ "deopt"(i32 0) ]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[PTR:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[X:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[X_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[X_INC]] = add i32 [[X]], [[VAL]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) ["deopt" (i32 0)]
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; Can't hoist over a side effect
+define void @test2(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[X:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[X_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    store i32 0, i32* [[PTR:%.*]]
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[COND:%.*]]) [ "deopt"(i32 0) ]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[PTR]]
+; CHECK-NEXT:    [[X_INC]] = add i32 [[X]], [[VAL]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  store i32 0, i32* %ptr
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) ["deopt" (i32 0)]
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; Can't hoist over a side effect
+define void @test2b(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test2b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 1
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[X:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[X_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    store i32 0, i32* [[P2]]
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[COND:%.*]]) [ "deopt"(i32 0) ]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[PTR]]
+; CHECK-NEXT:    [[X_INC]] = add i32 [[X]], [[VAL]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  %p2 = getelementptr i32, i32* %ptr, i32 1
+  store i32 0, i32* %p2
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) ["deopt" (i32 0)]
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; Hoist guard. Cannot hoist load because of aliasing.
+define void @test3(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[COND:%.*]]) [ "deopt"(i32 0) ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[X:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[X_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[PTR:%.*]]
+; CHECK-NEXT:    store i32 0, i32* [[PTR]]
+; CHECK-NEXT:    [[X_INC]] = add i32 [[X]], [[VAL]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) ["deopt" (i32 0)]
+  %val = load i32, i32* %ptr
+  store i32 0, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; Hoist load and guard.
+define void @test4(i1 %c, i32* %p) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       if.false:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Do not hoist across a conditionally executed side effect.
+define void @test4a(i1 %c, i32* %p, i32* %q) {
+; CHECK-LABEL: @test4a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    store i32 123, i32* [[Q:%.*]]
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       if.false:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  store i32 123, i32* %q
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Do not hoist a conditionally executed guard.
+define void @test4b(i1 %c, i32* %p, i32* %q) {
+; CHECK-LABEL: @test4b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       if.false:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Hoist store, load and guard.
+define void @test4c(i1 %c, i32* %p, i8* noalias %s) {
+; CHECK-LABEL: @test4c(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i8 0, i8* [[S:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       if.false:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  store i8 0, i8* %s
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we don't hoist the guard across a store in a conditionally executed block.
+define void @test4d(i1 %c, i32* %p, i8* noalias %s) {
+; CHECK-LABEL: @test4d(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    store i8 0, i8* [[S:%.*]]
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       if.false:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  store i8 0, i8* %s
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that the guard is never moved above the store preceding it in the backedge: both are expected in entry, store first.
+define void @test4e(i1 %c, i32* %p, i8* noalias %s) {
+; CHECK-LABEL: @test4e(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    store i8 0, i8* [[S:%.*]]
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       if.false:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p  ; expected in entry after the pass
+  %invariant_cond = icmp ne i32 %a, 100  ; expected in entry after the pass
+  store i8 0, i8* %s  ; runs on every iteration, before the guard; expected in entry ahead of the guard
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]  ; expected in entry, after the store
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can hoist the guard in spite of a store that happens after it.
+define void @test4f(i1 %c, i32* %p, i8* noalias %s) {
+; CHECK-LABEL: @test4f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    store i8 0, i8* [[S:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       if.false:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  br i1 %c, label %if.true, label %if.false
+
+if.true:
+  br label %backedge
+
+if.false:
+  br label %backedge
+
+backedge:
+  %a = load i32, i32* %p  ; expected in entry after the pass
+  %invariant_cond = icmp ne i32 %a, 100  ; expected in entry after the pass
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]  ; expected in entry, before the store
+  store i8 0, i8* %s  ; follows the guard; expected in entry right after it
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Do not hoist an invariant guard across a variant guard that precedes it.
+define void @test5(i1 %c, i32* %p, i32* %q) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[VARIANT_COND:%.*]] = icmp ne i32 [[A]], [[IV]]
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[VARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  %a = load i32, i32* %p  ; expected in entry after the pass
+  %invariant_cond = icmp ne i32 %a, 100  ; expected in entry after the pass
+  %variant_cond = icmp ne i32 %a, %iv  ; depends on %iv, which changes every iteration
+  call void (i1, ...) @llvm.experimental.guard(i1 %variant_cond) [ "deopt"() ]  ; variant guard: stays in the loop
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]  ; invariant condition, but expected to stay behind the variant guard
+  br label %backedge
+
+backedge:
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Hoist an invariant guard, leave the following variant guard in the loop.
+define void @test5a(i1 %c, i32* %p, i32* %q) {
+; CHECK-LABEL: @test5a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[P:%.*]]
+; CHECK-NEXT:    [[INVARIANT_COND:%.*]] = icmp ne i32 [[A]], 100
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[INVARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[VARIANT_COND:%.*]] = icmp ne i32 [[A]], [[IV]]
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 [[VARIANT_COND]]) [ "deopt"() ]
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %iv.next = add i32 %iv, 1
+  %a = load i32, i32* %p  ; expected in entry after the pass
+  %invariant_cond = icmp ne i32 %a, 100  ; expected in entry after the pass
+  %variant_cond = icmp ne i32 %a, %iv  ; depends on %iv, which changes every iteration
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]  ; first guard in the loop; expected in entry
+  call void (i1, ...) @llvm.experimental.guard(i1 %variant_cond) [ "deopt"() ]  ; variant guard: stays in the loop
+  br label %backedge
+
+backedge:
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+declare void @llvm.experimental.guard(i1, ...)

Added: llvm/trunk/test/Transforms/LICM/hoist-bitcast-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-bitcast-load.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-bitcast-load.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-bitcast-load.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,242 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
+; RUN: opt -S -basicaa -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure the basic alloca pointer hoisting works (the load of %c must appear before for.body):
+; CHECK-LABEL: @test1
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+; Function Attrs: nounwind uwtable
+define void @test1(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  %c = alloca i32  ; stack slot that the loop reads through a conditional load
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; conditionally executed load of the alloca; expected ahead of for.body
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; Make sure the basic alloca pointer hoisting works through a bitcast to a
+; pointer to a smaller type (i64 alloca read as i32):
+; CHECK-LABEL: @test2
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+; Function Attrs: nounwind uwtable
+define void @test2(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  %ca = alloca i64  ; the allocation is wider than the i32 that is later loaded from it
+  %c = bitcast i64* %ca to i32*  ; cast performed in entry, outside the loop
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; conditionally executed load; expected ahead of for.body
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; Make sure the basic alloca pointer hoisting works through an addrspacecast:
+; CHECK-LABEL: @test2_addrspacecast
+; CHECK: load i32, i32 addrspace(1)* %c, align 4
+; CHECK: for.body:
+
+; Function Attrs: nounwind uwtable
+define void @test2_addrspacecast(i32 addrspace(1)* nocapture %a, i32 addrspace(1)* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  %ca = alloca i64  ; the allocation is wider than the i32 that is later loaded from it
+  %c = addrspacecast i64* %ca to i32 addrspace(1)*  ; same slot, viewed as an i32 pointer in address space 1
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %indvars.iv
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32 addrspace(1)* %c, align 4  ; conditionally executed load; expected ahead of for.body
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %indvars.iv
+  %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32 addrspace(1)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; Make sure the basic alloca pointer hoisting works through a bitcast to a
+; pointer to a smaller type (where the bitcast also needs to be hoisted):
+; CHECK-LABEL: @test3
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+; Function Attrs: nounwind uwtable
+define void @test3(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  %ca = alloca i64  ; the allocation is wider than the i32 that is later loaded from it
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %c = bitcast i64* %ca to i32*  ; unlike @test2, the cast itself sits inside the conditional block
+  %1 = load i32, i32* %c, align 4  ; conditionally executed load; expected ahead of for.body
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; Make sure the basic alloca pointer hoisting does not happen through a bitcast
+; to a pointer to a larger type (i16 alloca read as i32):
+; CHECK-LABEL: @test4
+; CHECK: for.body:
+; CHECK: load i32, i32* %c, align 4
+
+; Function Attrs: nounwind uwtable
+define void @test4(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  %ca = alloca i16  ; the allocation is narrower than the i32 that is later loaded from it
+  %c = bitcast i16* %ca to i32*  ; widening view of the slot
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; expected to remain after the for.body label, i.e. not hoisted
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; Don't crash on bitcasts to unsized types; the load is expected to stay in the loop.
+; CHECK-LABEL: @test5
+; CHECK: for.body:
+; CHECK: load i32, i32* %c, align 4
+
+%atype = type opaque
+
+; Function Attrs: nounwind uwtable
+define void @test5(i32* nocapture %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  %ca = alloca i16  ; the allocation is narrower than the i32 that is later loaded from it
+  %cab = bitcast i16* %ca to %atype*  ; intermediate cast through the opaque (unsized) type
+  %c = bitcast %atype* %cab to i32*  ; final pointer used by the load
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; expected to remain after the for.body label, i.e. not hoisted
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+

Added: llvm/trunk/test/Transforms/LICM/hoist-debuginvariant.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-debuginvariant.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-debuginvariant.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-debuginvariant.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,55 @@
+; RUN: opt < %s -licm -S | FileCheck %s
+; RUN: opt < %s -strip-debug -licm -S | FileCheck %s
+; RUN: opt < %s -licm -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Verify that the sdiv is hoisted out of the loop
+; even in the presence of a preceding debug intrinsic.
+
+@a = global i32 0
+@b = global i32 0
+@c = global i32 0
+
+define void @fn1() !dbg !6 {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT: [[_TMP2:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: [[_TMP3:%.*]] = load i32, i32* @b, align 4
+; CHECK-NEXT: [[_TMP4:%.*]] = sdiv i32 [[_TMP2]], [[_TMP3]]
+; CHECK-NEXT: br label [[BB3:%.*]]
+
+  br label %bb3
+
+bb3:                                              ; preds = %bb3, %0
+  call void @llvm.dbg.value(metadata i32* @c, metadata !10, metadata !DIExpression(DW_OP_deref)), !dbg !12  ; debug intrinsic placed ahead of the invariant code
+  %_tmp2 = load i32, i32* @a, align 4  ; expected in the entry block after the pass
+  %_tmp3 = load i32, i32* @b, align 4  ; expected in the entry block after the pass
+  %_tmp4 = sdiv i32 %_tmp2, %_tmp3  ; expected in the entry block, immediately before the branch
+  store i32 %_tmp4, i32* @c, align 4
+  %_tmp6 = load volatile i32, i32* @c, align 4
+  br label %bb3  ; bb3 only branches back to itself
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+
+attributes #0 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "foo", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2)
+!1 = !DIFile(filename: "foo.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"foo"}
+!6 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 3, type: !7, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: false, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DILocalVariable(name: "f", scope: !11, line: 5, type: !9)
+!11 = distinct !DILexicalBlock(scope: !6, file: !1, line: 4, column: 12)
+!12 = !DILocation(line: 5, column: 9, scope: !11)

Added: llvm/trunk/test/Transforms/LICM/hoist-deref-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-deref-load.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-deref-load.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-deref-load.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,728 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(simplify-cfg,licm)' -S < %s | FileCheck %s
+; RUN: opt -S -basicaa -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(simplify-cfg,licm)' -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test represents the following function:
+; void test1(int * __restrict__ a, int * __restrict__ b, int &c, int n) {
+;   for (int i = 0; i < n; ++i)
+;     if (a[i] > 0)
+;       a[i] = c*b[i];
+; }
+; and we want to hoist the load of %c out of the loop. This can be done only
+; because the dereferenceable attribute is on %c.
+
+; CHECK-LABEL: @test1
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readonly nonnull dereferenceable(4) %c, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; conditionally executed load of %c; expected ahead of for.body
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; This is the same as @test1, but without the dereferenceable attribute on %c.
+; Without this attribute, we should not hoist the load of %c.
+
+; CHECK-LABEL: @test2
+; CHECK: if.then:
+; CHECK: load i32, i32* %c, align 4
+
+define void @test2(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readonly nonnull %c, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; %c is only nonnull here; expected to stay inside if.then
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; This test represents the following function:
+; void test3(int * restrict a, int * restrict b, int c[static 3], int n) {
+;   for (int i = 0; i < n; ++i)
+;     if (a[i] > 0)
+;       a[i] = c[2]*b[i];
+; }
+; and we want to hoist the load of c[2] out of the loop. This can be done only
+; because the dereferenceable attribute is on %c.
+
+; CHECK-LABEL: @test3
+; CHECK: load i32, i32* %c2, align 4
+; CHECK: for.body:
+
+define void @test3(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readonly dereferenceable(12) %c, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %c2 = getelementptr inbounds i32, i32* %c, i64 2  ; address of c[2]: bytes 8..11, within the 12 dereferenceable bytes
+  %1 = load i32, i32* %c2, align 4  ; conditionally executed load; expected ahead of for.body
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; This is the same as @test3, but with a dereferenceable attribute on %c with a
+; size too small to cover c[2] (and so we should not hoist it).
+
+; CHECK-LABEL: @test4
+; CHECK: if.then:
+; CHECK: load i32, i32* %c2, align 4
+
+define void @test4(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readonly dereferenceable(11) %c, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %c2 = getelementptr inbounds i32, i32* %c, i64 2  ; address of c[2]: bytes 8..11, one byte beyond the 11 dereferenceable bytes
+  %1 = load i32, i32* %c2, align 4  ; expected to stay inside if.then
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; This test represents the following function:
+; void test5(int * __restrict__ a, int *b, int &c, int n) {
+;   if (c != null)
+;     for (int i = 0; i < n; ++i)
+;       if (a[i] > 0)
+;         a[i] = c*b[i];
+; }
+; and we want to hoist the load of %c out of the loop. This can be done only
+; because the dereferenceable_or_null attribute is on %c and there is a null
+; check on %c.
+
+; CHECK-LABEL: @test5
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+define void @test5(i32* noalias %a, i32* %b, i32* dereferenceable_or_null(4) %c, i32 %n) #0 {
+entry:
+  %not_null = icmp ne i32* %c, null
+  br i1 %not_null, label %not.null, label %for.end  ; the loop is reachable only through the non-null edge
+
+not.null:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %not.null, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %not.null ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; conditionally executed load of %c; expected ahead of for.body
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry, %not.null
+  ret void
+}
+
+; This is the same as @test5, but without the null check on %c.
+; Without this check, we should not hoist the load of %c.
+
+; This test case has an icmp on c but the use of this comparison is
+; not a branch (it is only returned).
+
+; CHECK-LABEL: @test6
+; CHECK: if.then:
+; CHECK: load i32, i32* %c, align 4
+
+define i1 @test6(i32* noalias %a, i32* %b, i32* dereferenceable_or_null(4) %c, i32 %n) #0 {
+entry:
+  %not_null = icmp ne i32* %c, null  ; computed but never branched on; only used by the ret in for.end
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; expected to stay inside if.then
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret i1 %not_null
+}
+
+; This test represents the following function:
+; void test7(int * __restrict__ a, int *b, int **cptr, int n) {
+;   c = *cptr;
+;   for (int i = 0; i < n; ++i)
+;     if (a[i] > 0)
+;       a[i] = (*c)*b[i];
+; }
+; and we want to hoist the load of %c out of the loop. This can be done only
+; because of the dereferenceable metadata on the c = *cptr load.
+
+; CHECK-LABEL: @test7
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+define void @test7(i32* noalias %a, i32* %b, i32** %cptr, i32 %n) #0 {
+entry:
+  %c = load i32*, i32** %cptr, !dereferenceable !0  ; dereferenceability comes from metadata on this load, not from an argument attribute
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4  ; conditionally executed load through %c; expected ahead of for.body
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; This test represents the following function:
+; void test8(int * __restrict__ a, int *b, int **cptr, int n) {
+;   int *c = *cptr;
+;   if (c != null)
+;     for (int i = 0; i < n; ++i)
+;       if (a[i] > 0)
+;         a[i] = (*c)*b[i];
+; }
+; and we want to hoist the load of %c out of the loop. This can be done only
+; because of the dereferenceable_or_null metadata on the c = *cptr load and
+; because there is a null check on %c.
+
+; CHECK-LABEL: @test8
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+define void @test8(i32* noalias %a, i32* %b, i32** %cptr, i32 %n) #0 {
+entry:
+  %c = load i32*, i32** %cptr, !dereferenceable_or_null !0
+  %not_null = icmp ne i32* %c, null
+  br i1 %not_null, label %not.null, label %for.end
+
+not.null:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %not.null, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %not.null ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry, %not.null
+  ret void
+}
+
+; This is the same as @test8, but without the null check on %c.
+; Without this check, we should not hoist the load of %c.
+
+; CHECK-LABEL: @test9
+; CHECK: if.then:
+; CHECK: load i32, i32* %c, align 4
+
+define void @test9(i32* noalias %a, i32* %b, i32** %cptr, i32 %n) #0 {
+entry:
+  %c = load i32*, i32** %cptr, !dereferenceable_or_null !0
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; In this test we should be able to only hoist load from %cptr. We can't hoist
+; load from %c because its dereferenceability can depend on %cmp1 condition.
+; By moving it out of the loop we break this dependency and can not rely
+; on the dereferenceability anymore.
+; In other words this test checks that we strip dereferenceability metadata
+; after hoisting an instruction.
+
+; CHECK-LABEL: @test10
+; CHECK: %c = load i32*, i32** %cptr
+; CHECK-NOT: dereferenceable
+; CHECK: if.then:
+; CHECK: load i32, i32* %c, align 4
+
+define void @test10(i32* noalias %a, i32* %b, i32** dereferenceable(8) %cptr, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %c = load i32*, i32** %cptr, !dereferenceable !0
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+define void @test11(i32* noalias %a, i32* %b, i32** dereferenceable(8) %cptr, i32 %n) #0 {
+; CHECK-LABEL: @test11(
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+; CHECK: for.body.preheader:
+; CHECK:  %c = load i32*, i32** %cptr, !dereferenceable !0
+; CHECK:  %d = load i32, i32* %c, align 4
+
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  %c = load i32*, i32** %cptr, !dereferenceable !0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %d = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %e = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %e, %d
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test12(i32* noalias %a, i32* %b, i32* dereferenceable_or_null(4) %c, i32 %n) #0 {
+; Prove non-nullness of %c via a guard, not a branch.
+
+; CHECK-LABEL: @test12(
+entry:
+  %not_null = icmp ne i32* %c, null
+  call void(i1, ...) @llvm.experimental.guard(i1 %not_null) [ "deopt"() ]
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+; CHECK: for.body.preheader:
+; CHECK-NEXT:  [[VAL:%[^ ]]] = load i32, i32* %c, align 4
+; CHECK-NEXT:  br label %for.body
+
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry, %entry
+  ret void
+}
+
+define void @test13(i32* noalias %a, i32* %b, i32* dereferenceable_or_null(4) %c, i32 %n) #0 {
+; Like @test12, but has a post-dominating guard, which cannot be used
+; to prove %c is nonnull at the point of the load.
+
+; CHECK-LABEL: @test13(
+entry:
+  %not_null = icmp ne i32* %c, null
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+; CHECK: for.body.preheader:
+; CHECK-NOT:  load i32, i32* %c
+; CHECK:  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+; CHECK: if.then:
+; CHECK:  load i32, i32* %c
+; CHECK:  br label %for.inc
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry, %entry
+  call void(i1, ...) @llvm.experimental.guard(i1 %not_null) [ "deopt"() ]
+  ret void
+}
+
+; Check that a branch on the condition "null check AND something" allows the
+; load to be hoisted.
+define void @test14(i32* noalias %a, i32* %b, i32* dereferenceable_or_null(4) %c, i32 %n, i1 %dummy_cond) #0 {
+
+; CHECK-LABEL: @test14
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+entry:
+  %not_null = icmp ne i32* %c, null
+  %dummy_and = and i1 %not_null, %dummy_cond
+  br i1 %dummy_and, label %not.null, label %for.end
+
+not.null:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %not.null, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %not.null ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry, %not.null
+  ret void
+}
+
+; Check that a guard on the condition "null check AND something" allows the
+; load to be hoisted.
+define void @test15(i32* noalias %a, i32* %b, i32* dereferenceable_or_null(4) %c, i32 %n, i1 %dummy_cond) #0 {
+
+; CHECK-LABEL: @test15
+; CHECK: load i32, i32* %c, align 4
+; CHECK: for.body:
+
+entry:
+  %not_null = icmp ne i32* %c, null
+  %dummy_and = and i1 %not_null, %dummy_cond
+  call void(i1, ...) @llvm.experimental.guard(i1 %dummy_and) [ "deopt"() ]
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; Ensure that (c == null && other_cond) does not automatically mean that c is
+; non-null in false branch. So the condition ((c == null && other_cond) == false)
+; is not sufficient to conclude that c != null.
+define void @test16(i32* noalias %a, i32* %b, i32* dereferenceable_or_null(4) %c, i32 %n, i1 %dummy_cond) #0 {
+
+; CHECK-LABEL: @test16
+; CHECK: for.body:
+; CHECK: load i32, i32* %c, align 4
+
+entry:
+  %not_null = icmp eq i32* %c, null
+  %dummy_and = and i1 %not_null, %dummy_cond
+  br i1 %dummy_and, label %for.end, label %not.null
+
+not.null:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %not.null, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %not.null ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry, %not.null
+  ret void
+}
+
+; Same as @test16, but the condition (c == null && other_cond) feeds a guard
+; instead of a branch. Such a guard does not let us conclude that c != null,
+; so the load from %c must not be hoisted out of the loop.
+define void @test17(i32* noalias %a, i32* %b, i32* dereferenceable_or_null(4) %c, i32 %n, i1 %dummy_cond) #0 {
+
+; CHECK-LABEL: @test17
+; CHECK: for.body:
+; CHECK: load i32, i32* %c, align 4
+
+entry:
+  %not_null = icmp eq i32* %c, null
+  %dummy_and = and i1 %not_null, %dummy_cond
+  call void(i1, ...) @llvm.experimental.guard(i1 %dummy_and) [ "deopt"() ]
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32, i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+!0 = !{i64 4}

Added: llvm/trunk/test/Transforms/LICM/hoist-fast-fdiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-fast-fdiv.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-fast-fdiv.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-fast-fdiv.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,61 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+; RUN: opt -licm -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s
+
+; Function Attrs: noinline norecurse nounwind readnone ssp uwtable
+define zeroext i1 @invariant_denom(double %v) #0 {
+entry:
+; CHECK-LABEL: @invariant_denom(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: fdiv fast double 1.000000e+00, %v
+  br label %loop
+
+loop:                                       ; preds = %entry, %loop
+  %v3 = phi i32 [ 0, %entry ], [ %v11, %loop ]
+  %v4 = phi i32 [ 0, %entry ], [ %v12, %loop ]
+  %v5 = uitofp i32 %v4 to double
+
+; CHECK-LABEL: loop:
+; CHECK: fmul fast double
+; CHECK-NOT: fdiv
+  %v6 = fdiv fast double %v5, %v
+  %v7 = fptoui double %v6 to i64
+  %v8 = and i64 %v7, 1
+  %v9 = xor i64 %v8, 1
+  %v10 = trunc i64 %v9 to i32
+  %v11 = add i32 %v10, %v3
+  %v12 = add nuw i32 %v4, 1
+  %v13 = icmp eq i32 %v12, -1
+  br i1 %v13, label %end, label %loop
+
+end:                                      ; preds = %loop
+  %v15 = phi i32 [ %v11, %loop ]
+  %v16 = icmp ne i32 %v15, 0
+  ret i1 %v16
+}
+
+define void @invariant_fdiv(float* %out, float %arg) {
+; CHECK-LABEL: @invariant_fdiv(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %div = fdiv fast float 4.000000e+00, %arg
+; CHECK-NEXT: fmul fast float %div, 0x41F0000000000000
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %ind = phi i32 [ 0, %entry ], [ %inc, %loop ]
+
+; CHECK-LABEL: loop:
+; CHECK: getelementptr
+; CHECK-NOT: fdiv
+; CHECK-NOT: fmul
+  %div = fdiv fast float 4.000000e+00, %arg
+  %mul = fmul fast float %div, 0x41F0000000000000
+  %gep = getelementptr inbounds float, float* %out, i32 %ind
+  store float %mul, float* %gep, align 4
+  %inc = add nuw nsw i32 %ind, 1
+  %cond = icmp eq i32 %inc, 1024
+  br i1 %cond, label %exit, label %loop
+
+exit:                                              ; preds = %loop
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/hoist-invariant-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-invariant-load.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-invariant-load.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-invariant-load.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,41 @@
+; REQUIRES: asserts
+; RUN: opt < %s -licm -disable-basicaa -stats -S 2>&1 | grep "1 licm"
+; RUN: opt < %s -licm -enable-mssa-loop-dependency=true -verify-memoryssa -disable-basicaa -stats -S 2>&1 | grep "1 licm"
+
+@"\01L_OBJC_METH_VAR_NAME_" = internal global [4 x i8] c"foo\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+@"\01L_OBJC_SELECTOR_REFERENCES_" = internal global i8* getelementptr inbounds ([4 x i8], [4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_IMAGE_INFO" = internal constant [2 x i32] [i32 0, i32 16], section "__DATA, __objc_imageinfo, regular, no_dead_strip"
+@llvm.used = appending global [3 x i8*] [i8* getelementptr inbounds ([4 x i8], [4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i32 0, i32 0), i8* bitcast (i8** @"\01L_OBJC_SELECTOR_REFERENCES_" to i8*), i8* bitcast ([2 x i32]* @"\01L_OBJC_IMAGE_INFO" to i8*)], section "llvm.metadata"
+
+define void @test(i8* %x) uwtable ssp {
+entry:
+  %x.addr = alloca i8*, align 8
+  %i = alloca i32, align 4
+  store i8* %x, i8** %x.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp ult i32 %0, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i8*, i8** %x.addr, align 8
+  %2 = load i8*, i8** @"\01L_OBJC_SELECTOR_REFERENCES_", !invariant.load !0
+  %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %1, i8* %2)
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %3 = load i32, i32* %i, align 4
+  %inc = add i32 %3, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
+
+!0 = !{}

Added: llvm/trunk/test/Transforms/LICM/hoist-mustexec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-mustexec.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-mustexec.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-mustexec.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,605 @@
+; REQUIRES: asserts
+; RUN: opt -S -basicaa -licm -ipt-expensive-asserts=true < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -ipt-expensive-asserts=true -S %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f() nounwind
+declare void @llvm.experimental.guard(i1,...)
+
+; constant fold on first iteration
+define i32 @test1(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test1(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %r.chk = icmp ult i32 %iv, 2000
+  br i1 %r.chk, label %continue, label %fail
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; Same as test1, but with a floating point IR and fcmp
+define i32 @test_fcmp(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test_fcmp(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi float [ 0.0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %r.chk = fcmp olt float %iv, 2000.0
+  br i1 %r.chk, label %continue, label %fail
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = fadd float %iv, 1.0
+  %exitcond = fcmp ogt float %inc, 1000.0
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; Count down from a.length w/entry guard
+; TODO: currently unable to prove the following:
+; ule i32 (add nsw i32 %len, -1), %len where len is [0, 512]
+define i32 @test2(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test2(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.non.pos = icmp eq i32 %len, 0
+  br i1 %is.non.pos, label %fail, label %preheader
+preheader:
+  %lenminusone = add nsw i32 %len, -1
+  br label %for.body
+for.body:
+  %iv = phi i32 [ %lenminusone, %preheader ], [ %dec, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ule i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %dec = add nsw i32 %iv, -1
+  %exitcond = icmp eq i32 %dec, 0
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; trivially true for zero
+define i32 @test3(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test3(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.zero = icmp eq i32 %len, 0
+  br i1 %is.zero, label %fail, label %preheader
+preheader:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+for.body:
+  %iv = phi i32 [ 0, %preheader ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ule i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; requires fact length is non-zero
+; TODO: IsKnownNonNullFromDominatingConditions is currently only done for
+; pointers; should handle integers too
+define i32 @test4(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test4(
+entry:
+  %len = load i32, i32* %a, align 4, !range !{i32 0, i32 512}
+  %is.zero = icmp eq i32 %len, 0
+  br i1 %is.zero, label %fail, label %preheader
+preheader:
+  br label %for.body
+for.body:
+  %iv = phi i32 [ 0, %preheader ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %preheader ], [ %add, %continue ]
+  %r.chk = icmp ult i32 %iv, %len
+  br i1 %r.chk, label %continue, label %fail
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; variation on test1 with branch swapped
+define i32 @test-brswap(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-brswap(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %r.chk = icmp ugt i32 %iv, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+define i32 @test-nonphi(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-nonphi(
+entry:
+  br label %for.body
+
+for.body:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %xor = xor i32 %iv, 72
+  %r.chk = icmp ugt i32 %xor, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+define i32 @test-wrongphi(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-wrongphi(
+entry:
+  br label %for.body
+  
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue ]
+  %cond = icmp ult i32 %iv, 500
+  br i1 %cond, label %dummy_block1, label %dummy_block2
+
+dummy_block1:
+  br label %dummy_block2
+
+dummy_block2:
+  %wrongphi = phi i32 [11, %for.body], [12, %dummy_block1]
+  %r.chk = icmp ugt i32 %wrongphi, 2000
+  br i1 %r.chk, label %fail, label %continue
+continue:
+; CHECK-LABEL: continue
+; CHECK: %i1 = load i32, i32* %a, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+; This works because loop-simplify is run implicitly, but test for it anyways
+define i32 @test-multiple-latch(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test-multiple-latch(
+entry:
+; CHECK: %i1 = load i32, i32* %a, align 4
+; CHECK-NEXT: br label %for.body
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %continue1 ], [ %inc, %continue2 ]
+  %acc = phi i32 [ 0, %entry ], [ %add, %continue1 ], [ %add, %continue2 ]
+  %r.chk = icmp ult i32 %iv, 2000
+  br i1 %r.chk, label %continue1, label %fail
+continue1:
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %acc
+  %inc = add nuw nsw i32 %iv, 1
+  %cmp = icmp eq i32 %add, 0
+  br i1 %cmp, label %continue2, label %for.body
+continue2:
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+fail:
+  call void @f()
+  ret i32 -1
+}
+
+define void @test-hoisting-in-presence-of-guards(i1 %c, i32* %p) {
+
+; CHECK-LABEL: @test-hoisting-in-presence-of-guards
+; CHECK:       entry:
+; CHECK:         %a = load i32, i32* %p
+; CHECK:         %invariant_cond = icmp ne i32 %a, 100
+; CHECK:       loop:
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.next = add i32 %iv, 1
+  %a = load i32, i32* %p
+  %invariant_cond = icmp ne i32 %a, 100
+  call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond) [ "deopt"() ]
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+declare void @may_throw() inaccessiblememonly
+
+; Test that we can hoist a must-execute load from the loop header even in
+; presence of throwing instructions after it.
+define void @test_hoist_from_header_01(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_header_01(
+; CHECK:       entry:
+; CHECK-NEXT:  %load = load i32, i32* %p
+; CHECK-NOT:   load i32
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %load = load i32, i32* %p
+  call void @may_throw()
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_hoist_from_header_02(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_header_02(
+; CHECK:       entry:
+; CHECK-NEXT:  %load = load i32, i32* %p
+; CHECK-NOT:   load i32
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %load = load i32, i32* %p
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  call void @may_throw()
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_hoist_from_header_03(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_header_03(
+; CHECK:       entry:
+; CHECK-NEXT:  %load = load i32, i32* %p
+; CHECK-NOT:   load i32
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %load = load i32, i32* %p
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  call void @may_throw()
+  %iv.next = add i32 %iv, %merge
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that a throwing instruction prohibits hoisting across it.
+define void @test_hoist_from_header_04(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_header_04(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  call void @may_throw()
+  %load = load i32, i32* %p
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can hoist a mustexecute load from backedge even if something
+; throws after it.
+define void @test_hoist_from_backedge_01(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_01(
+; CHECK:       entry:
+; CHECK-NEXT:  %load = load i32, i32* %p
+; CHECK-NOT:   load i32
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  call void @may_throw()
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we don't hoist the load if something before it can throw.
+define void @test_hoist_from_backedge_02(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_02(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  call void @may_throw()
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_hoist_from_backedge_03(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_03(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  call void @may_throw()
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_hoist_from_backedge_04(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_04(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  call void @may_throw()
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/hoist-nounwind.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-nounwind.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-nounwind.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-nounwind.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,98 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -S -basicaa -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f() nounwind
+
+; Don't hoist load past nounwind call.
+define i32 @test1(i32* noalias nocapture readonly %a) nounwind uwtable {
+; CHECK-LABEL: @test1(
+entry:
+  br label %for.body
+
+; CHECK: tail call void @f()
+; CHECK-NEXT: load i32
+for.body:
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %x.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  tail call void @f() nounwind
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %x.05
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+}
+
+; Don't hoist division past nounwind call.
+define i32 @test2(i32 %N, i32 %c) nounwind uwtable {
+; CHECK-LABEL: @test2(
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+; CHECK: tail call void @f()
+; CHECK-NEXT: sdiv i32
+for.body:
+  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  tail call void @f() nounwind
+  %div = sdiv i32 5, %c
+  %add = add i32 %i.05, 1
+  %inc = add i32 %add, %div
+  %cmp = icmp slt i32 %inc, %N
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret i32 0
+}
+
+; Hoist a non-volatile load past volatile load.
+define i32 @test3(i32* noalias nocapture readonly %a, i32* %v) nounwind uwtable {
+; CHECK-LABEL: @test3(
+entry:
+  br label %for.body
+
+; CHECK: load i32
+; CHECK: for.body:
+; CHECK: load volatile i32
+; CHECK-NOT: load
+for.body:
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %x.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %xxx = load volatile i32, i32* %v, align 4
+  %i1 = load i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %x.05
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+}
+
+; Don't hoist a volatile load past another volatile load.
+define i32 @test4(i32* noalias nocapture readonly %a, i32* %v) nounwind uwtable {
+; CHECK-LABEL: @test4(
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: load volatile i32
+; CHECK-NEXT: load volatile i32
+for.body:
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %x.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %xxx = load volatile i32, i32* %v, align 4
+  %i1 = load volatile i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %x.05
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+}
\ No newline at end of file

Added: llvm/trunk/test/Transforms/LICM/hoist-phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-phi.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-phi.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-phi.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,1520 @@
+; RUN: opt -S -licm < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+; RUN: opt -S -licm -licm-control-flow-hoisting=1 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-ENABLED
+; RUN: opt -S -licm -licm-control-flow-hoisting=0 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+; RUN: opt -passes='require<opt-remark-emit>,loop(licm)' -S < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+; RUN: opt -passes='require<opt-remark-emit>,loop(licm)' -licm-control-flow-hoisting=1 -S < %s | FileCheck %s -check-prefixes=CHECK,CHECK-ENABLED
+; RUN: opt -passes='require<opt-remark-emit>,loop(licm)' -licm-control-flow-hoisting=0 -S < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+
+; RUN: opt -passes='require<opt-remark-emit>,loop(licm)' -licm-control-flow-hoisting=1 -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s -check-prefixes=CHECK,CHECK-ENABLED
+; Enable run below when adding promotion. e.g. "store i32 %phi, i32* %p" is promoted to phi.lcssa.
+; opt -passes='require<opt-remark-emit>,loop(licm)' -licm-control-flow-hoisting=0 -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s -check-prefixes=CHECK,CHECK-DISABLED
+
+
+; CHECK-LABEL: @triangle_phi
+define void @triangle_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK: %add = add i32 %x, 1
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: phi i32 [ %add, %[[IF_LICM]] ], [ %x, %entry ]
+; CHECK-ENABLED: store i32 %phi, i32* %p
+; CHECK-ENABLED: %cmp2 = icmp ne i32 %phi, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %add = add i32 %x, 1
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %x, %loop ]
+; CHECK-DISABLED: %cmp2 = icmp ne i32 %phi, 0
+then:
+  %phi = phi i32 [ %add, %if ], [ %x, %loop ]
+  store i32 %phi, i32* %p
+  %cmp2 = icmp ne i32 %phi, 0
+  br i1 %cmp2, label %loop, label %end
+
+; CHECK-LABEL: end:
+; CHECK-DISABLED: %[[PHI_LCSSA:.*]] = phi i32 [ %phi, %then ]
+; CHECK-DISABLED: store i32 %[[PHI_LCSSA]], i32* %p
+end:
+  ret void
+}
+
+; CHECK-LABEL: @diamond_phi
+define void @diamond_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]
+; CHECK-ENABLED: %phi = phi i32 [ %add, %[[IF_LICM]] ], [ %sub, %[[ELSE_LICM]] ]
+; CHECK-ENABLED: store i32 %phi, i32* %p
+; CHECK-ENABLED: %cmp2 = icmp ne i32 %phi, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  br label %then
+
+else:
+  %sub = sub i32 %x, 1
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+; CHECK-DISABLED: %cmp2 = icmp ne i32 %phi, 0
+then:
+  %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+  store i32 %phi, i32* %p
+  %cmp2 = icmp ne i32 %phi, 0
+  br i1 %cmp2, label %loop, label %end
+
+; CHECK-LABEL: end:
+; CHECK-DISABLED: %[[PHI_LCSSA:.*]] = phi i32 [ %phi, %then ]
+; CHECK-DISABLED: store i32 %[[PHI_LCSSA]], i32* %p
+end:
+  ret void
+}
+
+; TODO: This is currently too complicated for us to be able to hoist the phi.
+; CHECK-LABEL: @three_way_phi
+define void @three_way_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp2 = icmp sgt i32 %add, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK: %sub = sub i32 %x, 1
+; CHECK: br label %loop
+
+entry:
+  br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 %add, 0
+  br i1 %cmp2, label %if.if, label %then
+
+if.if:
+  %sub = sub i32 %x, 1
+  br label %then
+
+then:
+  %phi = phi i32 [ 0, %loop ], [ %add, %if ], [ %sub, %if.if ]
+  store i32 %phi, i32* %p
+  %cmp3 = icmp ne i32 %phi, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; TODO: This is currently too complicated for us to be able to hoist the phi.
+; CHECK-LABEL: @tree_phi
+define void @tree_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp2 = icmp sgt i32 %add, 0
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK: br label %loop
+
+entry:
+  br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 %add, 0
+  br i1 %cmp2, label %if.if, label %if.else
+
+if.if:
+  br label %then
+
+if.else:
+  br label %then
+
+else:
+  %sub = sub i32 %x, 1
+  br label %then
+
+then:
+  %phi = phi i32 [ %add, %if.if ], [ 0, %if.else ], [ %sub, %else ]
+  store i32 %phi, i32* %p
+  %cmp3 = icmp ne i32 %phi, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; TODO: We can hoist the first phi, but not the second.
+; CHECK-LABEL: @phi_phi
+define void @phi_phi(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp2 = icmp sgt i32 %add, 0
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK-ENABLED: br i1 %cmp2, label %[[IF_IF_LICM:.*]], label %[[IF_ELSE_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_IF_LICM]]:
+; CHECK-ENABLED: br label %[[IF_THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[IF_THEN_LICM]]
+
+; CHECK-ENABLED: [[IF_THEN_LICM]]:
+; CHECK-ENABLED: %phi1 = phi i32 [ %add, %[[IF_IF_LICM]] ], [ 0, %[[IF_ELSE_LICM]] ]
+; CHECK: br label %loop
+
+entry:
+  br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 %add, 0
+  br i1 %cmp2, label %if.if, label %if.else
+
+if.if:
+  br label %if.then
+
+if.else:
+  br label %if.then
+
+; CHECK-LABEL: if.then:
+; CHECK-DISABLED: %phi1 = phi i32 [ %add, %if.if ], [ 0, %if.else ]
+if.then:
+  %phi1 = phi i32 [ %add, %if.if ], [ 0, %if.else ]
+  br label %then
+
+else:
+  %sub = sub i32 %x, 1
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK: %phi2 = phi i32 [ %phi1, %if.then ], [ %sub, %else ]
+then:
+  %phi2 = phi i32 [ %phi1, %if.then ], [ %sub, %else ]
+  store i32 %phi2, i32* %p
+  %cmp3 = icmp ne i32 %phi2, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; Check that we correctly duplicate empty control flow.
+; CHECK-LABEL: @empty_triangle_phi
+define i8 @empty_triangle_phi(i32 %x, i32 %y) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp eq i32 %x, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %entry ]
+; CHECK: %cmp2 = icmp eq i32 %y, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp eq i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %loop ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %loop ]
+  %cmp2 = icmp eq i32 %y, 0
+  br i1 %cmp2, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; CHECK-LABEL: @empty_diamond_phi
+define i8 @empty_diamond_phi(i32 %x, i32 %y) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp eq i32 %x, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %[[ELSE_LICM]] ]
+; CHECK: %cmp2 = icmp eq i32 %y, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp eq i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  br label %then
+
+else:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %else ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %else ]
+  %cmp2 = icmp eq i32 %y, 0
+  br i1 %cmp2, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; Check that we correctly handle the case that the first thing we try to hoist is a phi.
+; CHECK-LABEL: @empty_triangle_phi_first
+define i8 @empty_triangle_phi_first(i32 %x, i1 %cond) {
+; CHECK-LABEL: entry:
+; CHECK-ENABLED: br i1 %cond, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %entry ]
+; CHECK: %cmp = icmp eq i32 %x, 0
+; CHECK: br label %loop
+
+loop:
+  br i1 %cond, label %if, label %then
+
+if:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %loop ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %loop ]
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; CHECK-LABEL: @empty_diamond_phi_first(
+define i8 @empty_diamond_phi_first(i32 %x, i1 %cond) {
+; CHECK-LABEL: entry:
+; CHECK-ENABLED: br i1 %cond, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+; Diamond variant: %loop holds only the branch on %cond, so the phi in %then is hoisted first.
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %[[ELSE_LICM]] ]
+; CHECK: %cmp = icmp eq i32 %x, 0
+; CHECK: br label %loop
+
+loop:
+  br i1 %cond, label %if, label %else
+
+if:
+  br label %then
+
+else:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %else ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %else ]
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; CHECK-LABEL: @empty_triangle_phi_first_empty_loop_head(
+define i8 @empty_triangle_phi_first_empty_loop_head(i32 %x, i1 %cond) {
+; CHECK-LABEL: entry:
+; CHECK-ENABLED: br i1 %cond, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+; The loop header is empty (it only branches to %test); the triangle hangs off %test instead.
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %entry ]
+; CHECK: %cmp = icmp eq i32 %x, 0
+; CHECK: br label %loop
+
+loop:
+  br label %test
+
+test:
+  br i1 %cond, label %if, label %then
+
+if:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %test ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %test ]
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; CHECK-LABEL: @empty_diamond_phi_first_empty_loop_head
+define i8 @empty_diamond_phi_first_empty_loop_head(i32 %x, i1 %cond) {
+; CHECK-LABEL: entry:
+; CHECK-ENABLED: br i1 %cond, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i8 [ 0, %[[IF_LICM]] ], [ 1, %[[ELSE_LICM]] ]
+; CHECK: %cmp = icmp eq i32 %x, 0
+; CHECK: br label %loop
+
+loop:
+  br label %test
+
+test:
+  br i1 %cond, label %if, label %else
+
+if:
+  br label %then
+
+else:
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i8 [ 0, %if ], [ 1, %else ]
+then:
+  %phi = phi i8 [ 0, %if ], [ 1, %else ]
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %end, label %loop
+
+end:
+  ret i8 %phi
+}
+
+; The phi is on one branch of a diamond while simultaneously at the end of a
+; triangle. Check that we duplicate the triangle and not the diamond.
+; CHECK-LABEL: @triangle_diamond
+define void @triangle_diamond(i32* %ptr, i32 %x, i32 %y) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp ne i32 %y, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i32 [ 0, %[[IF_LICM]] ], [ 127, %entry ]
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp ne i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %cmp2 = icmp ne i32 %y, 0
+  br i1 %cmp2, label %if.then, label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ 0, %if ], [ 127, %loop ]
+then:
+  %phi = phi i32 [ 0, %if ], [ 127, %loop ]
+  store i32 %phi, i32* %ptr
+  br label %end
+
+if.then:
+  br label %end
+
+end:
+  br label %loop
+}
+
+; As the previous, but the end of the diamond is the head of the loop.
+; CHECK-LABEL: @triangle_diamond_backedge
+define void @triangle_diamond_backedge(i32* %ptr, i32 %x, i32 %y) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp ne i32 %y, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i32 [ 0, %[[IF_LICM]] ], [ 127, %entry ]
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp ne i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %cmp2 = icmp ne i32 %y, 0
+  br i1 %cmp2, label %backedge, label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ 0, %if ], [ 127, %loop ]
+then:
+  %phi = phi i32 [ 0, %if ], [ 127, %loop ]
+  store i32 %phi, i32* %ptr
+  br label %loop
+
+backedge:
+  br label %loop
+}
+
+; TODO: The inner diamonds can be hoisted, but not currently the outer diamond
+; CHECK-LABEL: @diamonds_inside_diamond
+define void @diamonds_inside_diamond(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %cmp3 = icmp slt i32 %x, -10
+; CHECK-ENABLED: br i1 %cmp3, label %[[ELSE_IF_LICM:.*]], label %[[ELSE_ELSE_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[ELSE_IF_LICM]]:
+; CHECK-ENABLED: br label %[[ELSE_THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[ELSE_THEN_LICM]]
+
+; CHECK-ENABLED: [[ELSE_THEN_LICM]]:
+; CHECK-ENABLED: %phi2 = phi i32 [ 2, %[[ELSE_IF_LICM]] ], [ 3, %[[ELSE_ELSE_LICM]] ]
+; CHECK: %cmp2 = icmp sgt i32 %x, 10
+; CHECK-ENABLED: br i1 %cmp2, label %[[IF_IF_LICM:.*]], label %[[IF_ELSE_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_IF_LICM]]:
+; CHECK-ENABLED: br label %[[IF_THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[IF_THEN_LICM]]
+
+; CHECK-ENABLED: [[IF_THEN_LICM]]:
+; CHECK-ENABLED: %phi1 = phi i32 [ 0, %[[IF_IF_LICM]] ], [ 1, %[[IF_ELSE_LICM]] ]
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %cmp2 = icmp sgt i32 %x, 10
+  br i1 %cmp2, label %if.if, label %if.else
+
+if.if:
+  br label %if.then
+
+if.else:
+  br label %if.then
+
+; CHECK-LABEL: if.then:
+; CHECK-DISABLED: %phi1 = phi i32 [ 0, %if.if ], [ 1, %if.else ]
+if.then:
+  %phi1 = phi i32 [ 0, %if.if ], [ 1, %if.else ]
+  br label %then
+
+else:
+  %cmp3 = icmp slt i32 %x, -10
+  br i1 %cmp3, label %else.if, label %else.else
+
+else.if:
+  br label %else.then
+
+else.else:
+  br label %else.then
+
+; CHECK-LABEL: else.then:
+; CHECK-DISABLED: %phi2 = phi i32 [ 2, %else.if ], [ 3, %else.else ]
+else.then:
+  %phi2 = phi i32 [ 2, %else.if ], [ 3, %else.else ]
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK: %phi3 = phi i32 [ %phi1, %if.then ], [ %phi2, %else.then ]
+; CHECK: %cmp4 = icmp ne i32 %phi3, 0
+then:
+  %phi3 = phi i32 [ %phi1, %if.then ], [ %phi2, %else.then ]
+  store i32 %phi3, i32* %p
+  %cmp4 = icmp ne i32 %phi3, 0
+  br i1 %cmp4, label %loop, label %end
+
+end:
+  ret void
+}
+
+; We can hoist blocks that contain an edge that exits the loop by ignoring that
+; edge in the hoisted block.
+; CHECK-LABEL: @triangle_phi_loopexit
+define void @triangle_phi_loopexit(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp sgt i32 10, %add
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i32 [ %add, %[[IF_LICM]] ], [ %x, %entry ]
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 10, %add
+  br i1 %cmp2, label %then, label %end
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %x, %loop ]
+then:
+  %phi = phi i32 [ %add, %if ], [ %x, %loop ]
+  store i32 %phi, i32* %p
+  %cmp3 = icmp ne i32 %phi, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; CHECK-LABEL: @diamond_phi_oneloopexit
+define void @diamond_phi_oneloopexit(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp sgt i32 10, %add
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+; Diamond where %if also has a loop-exit edge; the hoisted copy of %if branches only to the join block.
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i32 [ %add, %[[IF_LICM]] ], [ %sub, %[[ELSE_LICM]] ]
+; CHECK-ENABLED: %cmp3 = icmp ne i32 %phi, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 10, %add
+  br i1 %cmp2, label %then, label %end
+
+else:
+  %sub = sub i32 %x, 1
+  br label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+then:
+  %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+  store i32 %phi, i32* %p
+  %cmp3 = icmp ne i32 %phi, 0
+  br i1 %cmp3, label %loop, label %end
+
+end:
+  ret void
+}
+
+; CHECK-LABEL: @diamond_phi_twoloopexit
+define void @diamond_phi_twoloopexit(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %sub = sub i32 %x, 1
+; CHECK-DAG: %add = add i32 %x, 1
+; CHECK-DAG: %cmp1 = icmp sgt i32 %x, 0
+; CHECK-DAG: %cmp2 = icmp sgt i32 10, %add
+; CHECK-DAG: %cmp3 = icmp sgt i32 10, %sub
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[ELSE_LICM:.*]]
+entry:
+  br label %loop
+; Diamond where both %if and %else have a loop-exit edge; the hoisted copies branch only to the join block.
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi = phi i32 [ %add, %[[IF_LICM]] ], [ %sub, %[[ELSE_LICM]] ]
+; CHECK-ENABLED: %cmp4 = icmp ne i32 %phi, 0
+; CHECK: br label %loop
+
+loop:
+  %cmp1 = icmp sgt i32 %x, 0
+  br i1 %cmp1, label %if, label %else
+
+if:
+  %add = add i32 %x, 1
+  %cmp2 = icmp sgt i32 10, %add
+  br i1 %cmp2, label %then, label %end
+
+else:
+  %sub = sub i32 %x, 1
+  %cmp3 = icmp sgt i32 10, %sub
+  br i1 %cmp3, label %then, label %end
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+; CHECK-DISABLED: %cmp4 = icmp ne i32 %phi, 0
+then:
+  %phi = phi i32 [ %add, %if ], [ %sub, %else ]
+  store i32 %phi, i32* %p
+  %cmp4 = icmp ne i32 %phi, 0
+  br i1 %cmp4, label %loop, label %end
+
+end:
+  ret void
+}
+
+; The store cannot be hoisted, so add and shr cannot be hoisted into a
+; conditional block.
+; CHECK-LABEL: @conditional_use
+define void @conditional_use(i32 %x, i32* %p) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cond = icmp ugt i32 %x, 0
+; CHECK-DAG: %add = add i32 %x, 5
+; CHECK-DAG: %shr = ashr i32 %add, 1
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  %cond = icmp ugt i32 %x, 0
+  br i1 %cond, label %if, label %else
+
+; CHECK-LABEL: if:
+; CHECK: store i32 %shr, i32* %p, align 4
+if:
+  %add = add i32 %x, 5
+  %shr = ashr i32 %add, 1
+  store i32 %shr, i32* %p, align 4
+  br label %then
+
+else:
+  br label %then
+
+then:
+  br label %loop
+}
+
+; A diamond with two triangles on the left and one on the right. This test is
+; to check that we have a unique loop preheader when we hoist the store (and so
+; don't fail an assertion).
+; CHECK-LABEL: @triangles_in_diamond
+define void @triangles_in_diamond(i32* %ptr) {
+; CHECK-LABEL: entry:
+; CHECK: store i32 0, i32* %ptr, align 4
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  br i1 undef, label %left_triangle_1, label %right_triangle
+
+left_triangle_1:
+  br i1 undef, label %left_triangle_1_if, label %left_triangle_2
+
+left_triangle_1_if:
+  br label %left_triangle_2
+
+left_triangle_2:
+  br i1 undef, label %left_triangle_2_if, label %left_triangle_2_then
+
+left_triangle_2_if:
+  br label %left_triangle_2_then
+
+left_triangle_2_then:
+  br label %loop.end
+
+right_triangle:
+  br i1 undef, label %right_triangle.if, label %right_triangle.then
+
+right_triangle.if:
+  br label %right_triangle.then
+
+right_triangle.then:
+  br label %loop.end
+
+loop.end:
+  store i32 0, i32* %ptr, align 4
+  br label %loop
+}
+
+; %cmp dominates its uses after being hoisted, but not after %brmerge is rehoisted
+; CHECK-LABEL: @rehoist
+define void @rehoist(i8* %this, i32 %x) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %sub = add nsw i32 %x, -1
+; CHECK-DAG: %fptr = bitcast i8* %this to void (i8*)*
+; CHECK-DAG: %cmp = icmp eq i32 0, %sub
+; CHECK-DAG: %brmerge = or i1 %cmp, true
+entry:
+  %sub = add nsw i32 %x, -1
+  br label %loop
+
+loop:
+  br i1 undef, label %if1, label %else1
+
+if1:
+  %fptr = bitcast i8* %this to void (i8*)*
+  call void %fptr(i8* %this)
+  br label %then1
+
+else1:
+  br label %then1
+
+then1:
+  %cmp = icmp eq i32 0, %sub
+  br i1 %cmp, label %end, label %else2
+
+else2:
+  %brmerge = or i1 %cmp, true
+  br i1 %brmerge, label %if3, label %end
+
+if3:
+  br label %end
+
+end:
+  br label %loop
+}
+
+; A test case that uses empty blocks in a way that can cause control flow
+; hoisting to get confused.
+; CHECK-LABEL: @empty_blocks_multiple_conditional_branches
+define void @empty_blocks_multiple_conditional_branches(float %arg, float* %ptr) {
+; CHECK-LABEL: entry
+; CHECK-DAG: %div1 = fmul float %arg, 4.000000e+00
+; CHECK-DAG: %div2 = fmul float %arg, 2.000000e+00
+entry:
+  br label %loop
+
+; The exact path to the phi isn't checked here, because it depends on whether
+; cond2 or cond3 is hoisted first
+; CHECK-ENABLED: %phi = phi float [ 0.000000e+00, %{{.*}} ], [ %div1, %{{.*}} ]
+; CHECK: br label %loop
+
+loop:
+  br i1 undef, label %backedge2, label %cond1
+
+cond1:
+  br i1 undef, label %cond1.if, label %cond1.else
+
+cond1.else:
+  br label %cond3
+
+cond1.if:
+  br label %cond1.if.next
+
+cond1.if.next:
+  br label %cond2
+
+cond2:
+  %div1 = fmul float %arg, 4.000000e+00
+  br i1 undef, label %cond2.if, label %cond2.then
+
+cond2.if:
+  br label %cond2.then
+
+; CHECK-LABEL: cond2.then:
+; CHECK-DISABLED: %phi = phi float [ 0.000000e+00, %cond2 ], [ %div1, %cond2.if ]
+cond2.then:
+  %phi = phi float [ 0.000000e+00, %cond2 ], [ %div1, %cond2.if ]
+  store float %phi, float* %ptr
+  br label %backedge2
+
+cond3:
+  br i1 undef, label %cond3.then, label %cond3.if
+
+cond3.if:
+  %div2 = fmul float %arg, 2.000000e+00
+  store float %div2, float* %ptr
+  br label %cond3.then
+
+cond3.then:
+  br label %loop
+
+backedge2:
+  br label %loop
+}
+
+; We can't do much here, so mainly just check that we don't crash.
+; CHECK-LABEL: @many_path_phi
+define void @many_path_phi(i32* %ptr1, i32* %ptr2) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %gep3 = getelementptr inbounds i32, i32* %ptr2, i32 2
+; CHECK-DAG: %gep2 = getelementptr inbounds i32, i32* %ptr2, i32 2
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  %phi1 = phi i32 [ 0, %entry ], [ %phi2, %end ]
+  %cmp1 = icmp ugt i32 %phi1, 3
+  br i1 %cmp1, label %cond2, label %cond1
+
+cond1:
+  br i1 undef, label %end, label %cond1.else
+
+cond1.else:
+  %gep2 = getelementptr inbounds i32, i32* %ptr2, i32 2
+  %val2 = load i32, i32* %gep2, align 4
+  %cmp2 = icmp eq i32 %val2, 13
+  br i1 %cmp2, label %cond1.end, label %end
+
+cond1.end:
+  br label %end
+
+cond2:
+  br i1 undef, label %end, label %cond2.else
+
+cond2.else:
+  %gep3 = getelementptr inbounds i32, i32* %ptr2, i32 2
+  %val3 = load i32, i32* %gep3, align 4
+  %cmp3 = icmp eq i32 %val3, 13
+  br i1 %cmp3, label %cond2.end, label %end
+
+cond2.end:
+  br label %end
+
+end:
+  %phi2 = phi i32 [ 1, %cond1 ], [ 2, %cond1.else ], [ 3, %cond1.end ], [ 4, %cond2 ], [ 5, %cond2.else ], [ 6, %cond2.end ]
+  br label %loop
+}
+
+; Check that we correctly handle the hoisting of %gep when there's a critical
+; edge that branches to the preheader.
+; CHECK-LABEL: @crit_edge
+define void @crit_edge(i32* %ptr, i32 %idx, i1 %cond1, i1 %cond2) {
+; CHECK-LABEL: entry:
+; CHECK: %gep = getelementptr inbounds i32, i32* %ptr, i32 %idx
+; CHECK: br label %preheader
+entry:
+  br label %preheader
+
+preheader:
+  br label %loop
+
+loop:
+  br i1 %cond1, label %then, label %if
+
+if:
+  %gep = getelementptr inbounds i32, i32* %ptr, i32 %idx
+  %val = load i32, i32* %gep
+  br label %then
+
+then:
+  %phi = phi i32 [ %val, %if ], [ 0, %loop ]
+  store i32 %phi, i32* %ptr
+  br i1 %cond2, label %loop, label %crit_edge
+
+crit_edge:
+  br label %preheader
+}
+
+; Check that the conditional sub is correctly hoisted from the inner loop to the
+; preheader of the outer loop.
+; CHECK-LABEL: @hoist_from_innermost_loop
+define void @hoist_from_innermost_loop(i32 %nx, i32* %ptr) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %sub = sub nsw i32 0, %nx
+; CHECK: br label %outer_loop
+entry:
+  br label %outer_loop
+
+outer_loop:
+  br label %middle_loop
+
+middle_loop:
+  br label %inner_loop
+
+inner_loop:
+  br i1 undef, label %inner_loop_end, label %if
+
+if:
+  %sub = sub nsw i32 0, %nx
+  store i32 %sub, i32* %ptr, align 4
+  br label %inner_loop_end
+
+inner_loop_end:
+  br i1 undef, label %inner_loop, label %middle_loop_end
+
+middle_loop_end:
+  br i1 undef, label %middle_loop, label %outer_loop_end
+
+outer_loop_end:
+  br label %outer_loop
+}
+
+; We have a diamond starting from %if, but %if.if is also reachable from %loop,
+; so %gep should not be conditionally hoisted.
+; CHECK-LABEL: @diamond_with_extra_in_edge
+define void @diamond_with_extra_in_edge(i32* %ptr1, i32* %ptr2, i32 %arg) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp2 = icmp ne i32 0, %arg
+; CHECK-DAG: %gep = getelementptr i32, i32* %ptr1, i32 4
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  %phi1 = phi i32 [ 0, %entry ], [ %phi2, %then ]
+  %cmp1 = icmp ugt i32 16, %phi1
+  br i1 %cmp1, label %if, label %if.if
+
+if:
+  %cmp2 = icmp ne i32 0, %arg
+  br i1 %cmp2, label %if.if, label %if.else
+
+if.if:
+  %gep = getelementptr i32, i32* %ptr1, i32 4
+  %val = load i32, i32* %gep, align 4
+  br label %then
+
+if.else:
+  br label %then
+
+then:
+  %phi2 = phi i32 [ %val, %if.if ], [ %phi1, %if.else ]
+  store i32 %phi2, i32* %ptr2, align 4
+  br label %loop
+}
+
+; %loop/%if/%then form a triangle, but %loop/%if/%then/%end also form a diamond.
+; The triangle should be picked for conditional hoisting.
+; CHECK-LABEL: @both_triangle_and_diamond
+define void @both_triangle_and_diamond(i32* %ptr1, i32* %ptr2, i32 %arg) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 0, %arg
+; CHECK-DAG: %gep = getelementptr i32, i32* %ptr1, i32 4
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_LICM:.*]], label %[[THEN_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF_LICM]]:
+; CHECK-ENABLED: br label %[[THEN_LICM]]
+
+; CHECK-ENABLED: [[THEN_LICM]]:
+; CHECK-ENABLED: %phi2 = phi i32 [ 0, %[[IF_LICM]] ], [ 1, %entry ]
+; CHECK: br label %loop
+
+loop:
+  %phi1 = phi i32 [ 0, %entry ], [ %phi3, %end ]
+  %cmp1 = icmp ne i32 0, %arg
+  br i1 %cmp1, label %if, label %then
+
+if:
+  %gep = getelementptr i32, i32* %ptr1, i32 4
+  %val = load i32, i32* %gep, align 4
+  %cmp2 = icmp ugt i32 16, %phi1
+  br i1 %cmp2, label %end, label %then
+
+; CHECK-LABEL: then:
+; CHECK-DISABLED: %phi2 = phi i32 [ 0, %if ], [ 1, %loop ]
+then:
+  %phi2 = phi i32 [ 0, %if ], [ 1, %loop ]
+  br label %end
+
+end:
+  %phi3 = phi i32 [ %phi2, %then ], [ %val, %if ]
+  store i32 %phi3, i32* %ptr2, align 4
+  br label %loop
+}
+
+; We shouldn't duplicate the branch at the end of %loop and should instead hoist
+; %val to %entry.
+; CHECK-LABEL: @same_destination_branch
+define i32 @same_destination_branch(i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 %arg2, 0
+; CHECK-DAG: %val = add i32 %arg1, 1
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+; CHECK-LABEL: loop:
+; CHECK: %phi = phi i32 [ 0, %entry ], [ %add, %then ]
+loop:
+  %phi = phi i32 [ 0, %entry ], [ %add, %then ]
+  %add = add i32 %phi, 1
+  %cmp1 = icmp ne i32 %arg2, 0
+  br i1 %cmp1, label %if, label %if
+
+if:
+  %val = add i32 %arg1, 1
+  br label %then
+
+then:
+  %cmp2 = icmp ne i32 %val, %phi
+  br i1 %cmp2, label %loop, label %end
+
+end:
+  ret i32 %val
+}
+
+; Diamond-like control flow but the left/right blocks actually have the same
+; destinations.
+; TODO: We could potentially hoist all of phi2-4, but currently only hoist phi2.
+; CHECK-LABEL: @diamond_like_same_destinations
+define i32 @diamond_like_same_destinations(i32 %arg1, i32 %arg2) {
+; CHECK-LABEL: entry:
+; CHECK-DAG: %cmp1 = icmp ne i32 %arg1, 0
+; CHECK-DAG: %cmp2 = icmp ugt i32 %arg2, 1
+; CHECK-DAG: %cmp3 = icmp ugt i32 %arg2, 2
+; CHECK-ENABLED: br i1 %cmp1, label %[[LEFT1_LICM:.*]], label %[[RIGHT1_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[LEFT1_LICM]]:
+; CHECK-ENABLED: br label %[[LEFT2_LICM:.*]]
+
+; CHECK-ENABLED: [[RIGHT1_LICM]]:
+; CHECK-ENABLED: br label %[[LEFT2_LICM]]
+
+; CHECK-ENABLED: [[LEFT2_LICM]]:
+; CHECK-ENABLED: %phi2 = phi i32 [ 0, %[[LEFT1_LICM]] ], [ 1, %[[RIGHT1_LICM]] ]
+; CHECK: br label %loop
+
+loop:
+  %phi1 = phi i32 [ 0, %entry ], [ %add, %loopend ]
+  %add = add i32 %phi1, 1
+  %cmp1 = icmp ne i32 %arg1, 0
+  br i1 %cmp1, label %left1, label %right1
+
+left1:
+  %cmp2 = icmp ugt i32 %arg2, 1
+  br i1 %cmp2, label %left2, label %right2
+
+right1:
+  %cmp3 = icmp ugt i32 %arg2, 2
+  br i1 %cmp3, label %left2, label %right2
+
+; CHECK-LABEL: left2:
+; CHECK-DISABLED: %phi2 = phi i32 [ 0, %left1 ], [ 1, %right1 ]
+left2:
+  %phi2 = phi i32 [ 0, %left1 ], [ 1, %right1 ]
+  br label %loopend
+
+; CHECK-LABEL: right2:
+; CHECK: %phi3 = phi i32 [ 2, %left1 ], [ 3, %right1 ]
+right2:
+  %phi3 = phi i32 [ 2, %left1 ], [ 3, %right1 ]
+  br label %loopend
+
+; CHECK-LABEL: loopend:
+; CHECK: %phi4 = phi i32 [ %phi2, %left2 ], [ %phi3, %right2 ]
+loopend:
+  %phi4 = phi i32 [ %phi2, %left2 ], [ %phi3, %right2 ]
+  %cmp4 = icmp ne i32 %phi1, 32
+  br i1 %cmp4, label %loop, label %end
+
+end:
+  ret i32 %phi4
+}
+
+; A phi with multiple incoming values for the same block due to a branch with
+; two destinations that are actually the same. We can't hoist this.
+; TODO: This could be hoisted by erasing one of the incoming values.
+; CHECK-LABEL: @phi_multiple_values_same_block
+define i32 @phi_multiple_values_same_block(i32 %arg) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp = icmp sgt i32 %arg, 4
+; CHECK-NOT: phi
+; CHECK: br label %loop
+entry:
+  br label %loop
+
+loop:
+  %cmp = icmp sgt i32 %arg, 4
+  br i1 %cmp, label %if, label %then
+
+if:
+  br i1 undef, label %then, label %then
+
+then:
+  %phi = phi i32 [ %arg, %loop ], [ 1, %if ], [ 1, %if ]
+  br i1 undef, label %exit, label %loop
+
+exit:
+  ret i32 %phi
+}
+
+; %phi is conditionally used in %d, and the store that %d is used in cannot be
+; hoisted. This means that we have to rehoist %d, but have to make sure to
+; rehoist it after %phi.
+; CHECK-LABEL: @phi_conditional_use
+define i64 @phi_conditional_use(i32 %f, i32* %g) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp eq i32 %f, 1
+; CHECK: %cmp2 = icmp eq i32 %f, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_END_LICM:.*]], label %[[IF_THEN_LICM:.*]]
+entry:
+  %cmp1 = icmp eq i32 %f, 1
+  %cmp2 = icmp eq i32 %f, 0
+  br label %loop
+
+; CHECK-ENABLED: [[IF_THEN_LICM]]:
+; CHECK-ENABLED: br label %[[IF_END_LICM]]
+
+; CHECK-ENABLED: [[IF_END_LICM]]:
+; CHECK-ENABLED: %phi = phi i64 [ 0, %entry ], [ 1, %[[IF_THEN_LICM]] ]
+; CHECK-ENABLED: %d = getelementptr inbounds i32, i32* %g, i64 %phi
+; CHECK-ENABLED: i1 %cmp2, label %[[LOOP_BACKEDGE_LICM:.*]], label %[[IF_THEN2_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_THEN2_LICM]]:
+; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM]]
+
+; CHECK-ENABLED: [[LOOP_BACKEDGE_LICM]]:
+; CHECK: br label %loop
+
+loop:
+  br i1 %cmp1, label %if.end, label %if.then
+
+if.then:
+  br label %if.end
+
+; CHECK-LABEL: if.end:
+; CHECK-DISABLED: %phi = phi i64 [ 0, %loop ], [ 1, %if.then ]
+if.end:
+  %phi = phi i64 [ 0, %loop ], [ 1, %if.then ]
+  br i1 %cmp2, label %loop.backedge, label %if.then2
+
+; CHECK-LABEL: if.then2:
+; CHECK-DISABLED: %d = getelementptr inbounds i32, i32* %g, i64 %phi
+if.then2:
+  %d = getelementptr inbounds i32, i32* %g, i64 %phi
+  store i32 1, i32* %d, align 4
+  br label %loop.backedge
+
+loop.backedge:
+  br label %loop
+}
+
+; As above, but we have two such phis
+; CHECK-LABEL: @phi_conditional_use_twice
+define i64 @phi_conditional_use_twice(i32 %f, i32* %g) {
+; CHECK-LABEL: entry:
+; CHECK: %cmp1 = icmp eq i32 %f, 1
+; CHECK: %cmp2 = icmp eq i32 %f, 0
+; CHECK-ENABLED: br i1 %cmp1, label %[[IF_END_LICM:.*]], label %[[IF_THEN_LICM:.*]]
+entry:
+  %cmp1 = icmp eq i32 %f, 1
+  %cmp2 = icmp eq i32 %f, 0
+  %cmp3 = icmp sgt i32 %f, 0
+  br label %loop
+
+; CHECK-ENABLED: [[IF_THEN_LICM]]:
+; CHECK-ENABLED: br label %[[IF_END_LICM]]
+
+; CHECK-ENABLED: [[IF_END_LICM]]:
+; CHECK-ENABLED: %phi1 = phi i64 [ 0, %entry ], [ 1, %[[IF_THEN_LICM]] ]
+; CHECK-ENABLED: %d = getelementptr inbounds i32, i32* %g, i64 %phi1
+; CHECK-ENABLED: i1 %cmp2, label %[[IF_END2_LICM:.*]], label %[[IF_THEN2_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_THEN2_LICM]]:
+; CHECK-ENABLED: br label %[[IF_END2_LICM]]
+
+; CHECK-ENABLED: [[IF_END2_LICM]]:
+; CHECK-ENABLED: %phi2 = phi i64 [ 2, %[[IF_END_LICM]] ], [ 3, %[[IF_THEN2_LICM]] ]
+; CHECK-ENABLED: %e = getelementptr inbounds i32, i32* %g, i64 %phi2
+; CHECK-ENABLED: i1 %cmp3, label %[[LOOP_BACKEDGE_LICM:.*]], label %[[IF_THEN3_LICM:.*]]
+
+; CHECK-ENABLED: [[IF_THEN3_LICM]]:
+; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM]]
+
+; CHECK-ENABLED: [[LOOP_BACKEDGE_LICM]]:
+; CHECK: br label %loop
+
+loop:
+  br i1 %cmp1, label %if.end, label %if.then
+
+if.then:
+  br label %if.end
+
+; CHECK-LABEL: if.end:
+; CHECK-DISABLED: %phi1 = phi i64 [ 0, %loop ], [ 1, %if.then ]
+if.end:
+  %phi1 = phi i64 [ 0, %loop ], [ 1, %if.then ]
+  br i1 %cmp2, label %if.end2, label %if.then2
+
+; CHECK-LABEL: if.then2:
+; CHECK-DISABLED: %d = getelementptr inbounds i32, i32* %g, i64 %phi1
+if.then2:
+  %d = getelementptr inbounds i32, i32* %g, i64 %phi1
+  store i32 1, i32* %d, align 4
+  br label %if.end2
+
+; CHECK-LABEL: if.end2:
+; CHECK-DISABLED: %phi2 = phi i64 [ 2, %if.end ], [ 3, %if.then2 ]
+if.end2:
+  %phi2 = phi i64 [ 2, %if.end ], [ 3, %if.then2 ]
+  br i1 %cmp3, label %loop.backedge, label %if.then3
+
+; CHECK-LABEL: if.then3:
+; CHECK-DISABLED: %e = getelementptr inbounds i32, i32* %g, i64 %phi2
+if.then3:
+  %e = getelementptr inbounds i32, i32* %g, i64 %phi2
+  store i32 1, i32* %e, align 4
+  br label %loop.backedge
+
+loop.backedge:
+  br label %loop
+}
+
+; The order that we hoist instructions from the loop is different to the textual
+; order in the function. Check that we can rehoist this correctly.
+; CHECK-LABEL: @rehoist_wrong_order_1
+define void @rehoist_wrong_order_1(i32* %ptr) {
+; CHECK-LABEL: entry
+; CHECK-DAG: %gep2 = getelementptr inbounds i32, i32* %ptr, i64 2
+; CHECK-DAG: %gep3 = getelementptr inbounds i32, i32* %ptr, i64 3
+; CHECK-DAG: %gep1 = getelementptr inbounds i32, i32* %ptr, i64 1
+; CHECK-ENABLED: br i1 undef, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF1_LICM]]:
+; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE1_LICM]]:
+; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM]]
+
+; CHECK-ENABLED: [[LOOP_BACKEDGE_LICM]]:
+; CHECK-ENABLED: br i1 undef, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
+
+; CHECK-ENABLED: [[IF3_LICM]]:
+; CHECK-ENABLED: br label %[[END_LICM]]
+
+; CHECK-ENABLED: [[END_LICM]]:
+; CHECK: br label %loop
+
+loop:
+  br i1 undef, label %if1, label %else1
+
+if1:
+  %gep1 = getelementptr inbounds i32, i32* %ptr, i64 1
+  store i32 0, i32* %gep1, align 4
+  br label %loop.backedge
+
+else1:
+  %gep2 = getelementptr inbounds i32, i32* %ptr, i64 2
+  store i32 0, i32* %gep2, align 4
+  br i1 undef, label %if2, label %loop.backedge
+
+if2:
+  br i1 undef, label %if3, label %end
+
+if3:
+  %gep3 = getelementptr inbounds i32, i32* %ptr, i64 3
+  store i32 0, i32* %gep3, align 4
+  br label %end
+
+end:
+  br label %loop.backedge
+
+loop.backedge:
+  br label %loop
+
+}
+
+; CHECK-LABEL: @rehoist_wrong_order_2
+define void @rehoist_wrong_order_2(i32* %ptr) {
+; CHECK-LABEL: entry
+; CHECK-DAG: %gep2 = getelementptr inbounds i32, i32* %ptr, i64 2
+; CHECK-DAG: %gep3 = getelementptr inbounds i32, i32* %gep2, i64 3
+; CHECK-DAG: %gep1 = getelementptr inbounds i32, i32* %ptr, i64 1
+; CHECK-ENABLED: br i1 undef, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF1_LICM]]:
+; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE1_LICM]]:
+; CHECK-ENABLED: br label %[[LOOP_BACKEDGE_LICM]]
+
+; CHECK-ENABLED: [[LOOP_BACKEDGE_LICM]]:
+; CHECK-ENABLED: br i1 undef, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
+
+; CHECK-ENABLED: [[IF3_LICM]]:
+; CHECK-ENABLED: br label %[[END_LICM]]
+
+; CHECK-ENABLED: [[END_LICM]]:
+; CHECK: br label %loop
+
+loop:
+  br i1 undef, label %if1, label %else1
+
+if1:
+  %gep1 = getelementptr inbounds i32, i32* %ptr, i64 1
+  store i32 0, i32* %gep1, align 4
+  br label %loop.backedge
+
+else1:
+  %gep2 = getelementptr inbounds i32, i32* %ptr, i64 2
+  store i32 0, i32* %gep2, align 4
+  br i1 undef, label %if2, label %loop.backedge
+
+if2:
+  br i1 undef, label %if3, label %end
+
+if3:
+  %gep3 = getelementptr inbounds i32, i32* %gep2, i64 3
+  store i32 0, i32* %gep3, align 4
+  br label %end
+
+end:
+  br label %loop.backedge
+
+loop.backedge:
+  br label %loop
+}
+
+; CHECK-LABEL: @rehoist_wrong_order_3
+define void @rehoist_wrong_order_3(i32* %ptr) {
+; CHECK-LABEL: entry
+; CHECK-DAG: %gep2 = getelementptr inbounds i32, i32* %ptr, i64 2
+; CHECK-DAG: %gep1 = getelementptr inbounds i32, i32* %ptr, i64 1
+; CHECK-ENABLED: br i1 undef, label %[[IF1_LICM:.*]], label %[[ELSE1_LICM:.*]]
+entry:
+  br label %loop
+
+; CHECK-ENABLED: [[IF1_LICM]]:
+; CHECK-ENABLED: br label %[[IF2_LICM:.*]]
+
+; CHECK-ENABLED: [[ELSE1_LICM]]:
+; CHECK-ENABLED: br label %[[IF2_LICM]]
+
+; CHECK-ENABLED: [[IF2_LICM]]:
+; CHECK-ENABLED: %phi = phi i32* [ %gep1, %[[IF1_LICM]] ], [ %gep2, %[[ELSE1_LICM]] ]
+; CHECK-ENABLED: %gep3 = getelementptr inbounds i32, i32* %phi, i64 3
+; CHECK-ENABLED: br i1 undef, label %[[IF3_LICM:.*]], label %[[END_LICM:.*]]
+
+; CHECK-ENABLED: [[IF3_LICM]]:
+; CHECK-ENABLED: br label %[[END_LICM]]
+
+; CHECK-ENABLED: [[END_LICM]]:
+; CHECK: br label %loop
+
+loop:
+  br i1 undef, label %if1, label %else1
+
+if1:
+  %gep1 = getelementptr inbounds i32, i32* %ptr, i64 1
+  store i32 0, i32* %gep1, align 4
+  br label %if2
+
+else1:
+  %gep2 = getelementptr inbounds i32, i32* %ptr, i64 2
+  store i32 0, i32* %gep2, align 4
+  br i1 undef, label %if2, label %loop.backedge
+
+if2:
+  %phi = phi i32* [ %gep1, %if1 ], [ %gep2, %else1 ]
+  br i1 undef, label %if3, label %end
+
+if3:
+  %gep3 = getelementptr inbounds i32, i32* %phi, i64 3
+  store i32 0, i32* %gep3, align 4
+  br label %end
+
+end:
+  br label %loop.backedge
+
+loop.backedge:
+  br label %loop
+}

Added: llvm/trunk/test/Transforms/LICM/hoist-round.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoist-round.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoist-round.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoist-round.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,70 @@
+; RUN: opt -S -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:32-v128:32:32-a0:0:32-n32"
+
+; This test verifies that ceil, floor, nearbyint, trunc, rint, round,
+; copysign, minnum, maxnum, minimum, maximum, fabs, and powi intrinsics are
+; considered safe to speculate.
+
+; CHECK-LABEL: @test
+; CHECK: call float @llvm.ceil.f32
+; CHECK: call float @llvm.floor.f32
+; CHECK: call float @llvm.nearbyint.f32
+; CHECK: call float @llvm.rint.f32
+; CHECK: call float @llvm.round.f32
+; CHECK: call float @llvm.trunc.f32
+; CHECK: call float @llvm.fabs.f32
+; CHECK: call float @llvm.copysign.f32
+; CHECK: call float @llvm.minnum.f32
+; CHECK: call float @llvm.maxnum.f32
+; CHECK: call float @llvm.powi.f32
+; CHECK: for.body:
+
+define void @test(float %arg1, float %arg2) {
+entry:
+  br label %for.head
+
+for.head:
+  %IND = phi i32 [ 0, %entry ], [ %IND.new, %for.body ]
+  %CMP = icmp slt i32 %IND, 10
+  br i1 %CMP, label %for.body, label %exit
+
+for.body:
+  %tmp.1 = call float @llvm.ceil.f32(float %arg1)
+  %tmp.2 = call float @llvm.floor.f32(float %tmp.1)
+  %tmp.3 = call float @llvm.nearbyint.f32(float %tmp.2)
+  %tmp.4 = call float @llvm.rint.f32(float %tmp.3)
+  %tmp.5 = call float @llvm.round.f32(float %tmp.4)
+  %tmp.6 = call float @llvm.trunc.f32(float %tmp.5)
+  %tmp.7 = call float @llvm.fabs.f32(float %tmp.6)
+  %tmp.8 = call float @llvm.copysign.f32(float %tmp.7, float %arg2)
+  %tmp.9 = call float @llvm.minnum.f32(float %tmp.8, float %arg2)
+  %tmp.10 = call float @llvm.maxnum.f32(float %tmp.9, float %arg2)
+  %tmp.11 = call float @llvm.minimum.f32(float %tmp.10, float %arg2)
+  %tmp.12 = call float @llvm.maximum.f32(float %tmp.11, float %arg2)
+  %tmp.13 = call float @llvm.powi.f32(float %tmp.12, i32 4)
+  call void @consume(float %tmp.13)
+  %IND.new = add i32 %IND, 1
+  br label %for.head
+
+exit:
+  ret void
+}
+
+declare void @consume(float)
+
+declare float @llvm.ceil.f32(float)
+declare float @llvm.floor.f32(float)
+declare float @llvm.nearbyint.f32(float)
+declare float @llvm.rint.f32(float)
+declare float @llvm.round.f32(float)
+declare float @llvm.trunc.f32(float)
+declare float @llvm.fabs.f32(float)
+declare float @llvm.copysign.f32(float, float)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare float @llvm.powi.f32(float, i32)

Added: llvm/trunk/test/Transforms/LICM/hoisting.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/hoisting.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/hoisting.ll (added)
+++ llvm/trunk/test/Transforms/LICM/hoisting.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,340 @@
+; RUN: opt < %s -licm -S | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S | FileCheck %s
+; RUN: opt < %s -licm -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
+
+@X = global i32 0		; <i32*> [#uses=1]
+
+declare void @foo()
+
+declare i32 @llvm.bitreverse.i32(i32)
+
+; This testcase tests for a problem where LICM hoists 
+; potentially trapping instructions when they are not guaranteed to execute.
+define i32 @test1(i1 %c) {
+; CHECK-LABEL: @test1(
+	%A = load i32, i32* @X		; <i32> [#uses=2]
+	br label %Loop
+Loop:		; preds = %LoopTail, %0
+	call void @foo( )
+	br i1 %c, label %LoopTail, label %IfUnEqual
+        
+IfUnEqual:		; preds = %Loop
+; CHECK: IfUnEqual:
+; CHECK-NEXT: sdiv i32 4, %A
+	%B1 = sdiv i32 4, %A		; <i32> [#uses=1]
+	br label %LoopTail
+        
+LoopTail:		; preds = %IfUnEqual, %Loop
+	%B = phi i32 [ 0, %Loop ], [ %B1, %IfUnEqual ]		; <i32> [#uses=1]
+	br i1 %c, label %Loop, label %Out
+Out:		; preds = %LoopTail
+	%C = sub i32 %A, %B		; <i32> [#uses=1]
+	ret i32 %C
+}
+
+
+declare void @foo2(i32) nounwind
+
+
+;; It is ok and desirable to hoist this potentially trapping instruction.
+define i32 @test2(i1 %c) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: load i32, i32* @X
+; CHECK-NEXT: %B = sdiv i32 4, %A
+  %A = load i32, i32* @X
+  br label %Loop
+
+Loop:
+  ;; Should have hoisted this div!
+  %B = sdiv i32 4, %A
+  br label %loop2
+
+loop2:
+  call void @foo2( i32 %B )
+  br i1 %c, label %Loop, label %Out
+
+Out:
+  %C = sub i32 %A, %B
+  ret i32 %C
+}
+
+
+; This loop invariant instruction should be constant folded, not hoisted.
+define i32 @test3(i1 %c) {
+; CHECK-LABEL: define i32 @test3(
+; CHECK: call void @foo2(i32 6)
+	%A = load i32, i32* @X		; <i32> [#uses=2]
+	br label %Loop
+Loop:
+	%B = add i32 4, 2		; <i32> [#uses=2]
+	call void @foo2( i32 %B )
+	br i1 %c, label %Loop, label %Out
+Out:		; preds = %Loop
+	%C = sub i32 %A, %B		; <i32> [#uses=1]
+	ret i32 %C
+}
+
+; CHECK-LABEL: @test4(
+; CHECK: call
+; CHECK: sdiv
+; CHECK: ret
+define i32 @test4(i32 %x, i32 %y) nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %n.01 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  call void @foo_may_call_exit(i32 0)
+  %div = sdiv i32 %x, %y
+  %add = add nsw i32 %n.01, %div
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %n.0.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %n.0.lcssa
+}
+
+declare void @foo_may_call_exit(i32)
+
+; PR14854
+; CHECK-LABEL: @test5(
+; CHECK: extractvalue
+; CHECK: br label %tailrecurse
+; CHECK: tailrecurse:
+; CHECK: ifend:
+; CHECK: insertvalue
+define { i32*, i32 } @test5(i32 %i, { i32*, i32 } %e) {
+entry:
+  br label %tailrecurse
+
+tailrecurse:                                      ; preds = %then, %entry
+  %i.tr = phi i32 [ %i, %entry ], [ %cmp2, %then ]
+  %out = extractvalue { i32*, i32 } %e, 1
+  %d = insertvalue { i32*, i32 } %e, i32* null, 0
+  %cmp1 = icmp sgt i32 %out, %i.tr
+  br i1 %cmp1, label %then, label %ifend
+
+then:                                             ; preds = %tailrecurse
+  call void @foo()
+  %cmp2 = add i32 %i.tr, 1
+  br label %tailrecurse
+
+ifend:                                            ; preds = %tailrecurse
+  ret { i32*, i32 } %d
+}
+
+; CHECK: define i32 @hoist_bitreverse(i32)
+; CHECK: bitreverse
+; CHECK: br label %header
+define i32 @hoist_bitreverse(i32)  {
+  br label %header
+
+header:
+  %sum = phi i32 [ 0, %1 ], [ %5, %latch ]
+  %2 = phi i32 [ 0, %1 ], [ %6, %latch ]
+  %3 = icmp slt i32 %2, 1024
+  br i1 %3, label %body, label %return
+
+body:
+  %4 = call i32 @llvm.bitreverse.i32(i32 %0)
+  %5 = add i32 %sum, %4
+  br label %latch
+
+latch:
+  %6 = add nsw i32 %2, 1
+  br label %header
+
+return:
+  ret i32 %sum
+}
+
+; Can neither sink nor hoist
+define i32 @test_volatile(i1 %c) {
+; CHECK-LABEL: @test_volatile(
+; CHECK-LABEL: Loop:
+; CHECK: load volatile i32, i32* @X
+; CHECK-LABEL: Out:
+  br label %Loop
+
+Loop:
+  %A = load volatile i32, i32* @X
+  br i1 %c, label %Loop, label %Out
+
+Out:
+  ret i32 %A
+}
+
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) nounwind
+declare void @escaping.invariant.start({}*) nounwind
+; invariant.start dominates the load, and in this scope, the
+; load is invariant. So, we can hoist the `addrld` load out of the loop.
+define i32 @test_fence(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+
+
+; Same as test above, but the load is no longer invariant (presence of
+; invariant.end). We cannot hoist the addrld out of loop.
+define i32 @test_fence1(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence1
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NEXT: invariant.end
+; CHECK-NEXT: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @llvm.invariant.end.p0i8({}* %invst, i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; same as test above, but instead of invariant.end, we have the result of
+; invariant.start escaping through a call. We cannot hoist the load.
+define i32 @test_fence2(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence2
+; CHECK-LABEL: entry
+; CHECK-NOT: load
+; CHECK: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @escaping.invariant.start({}* %invst)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; FIXME: invariant.start dominates the load, and in this scope, the
+; load is invariant. So, we can hoist the `addrld` load out of the loop.
+; Consider the load operand addr.i being bitcast before it is passed to
+; invariant.start.
+define i32 @test_fence3(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence3
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; We should not hoist the addrld out of the loop.
+define i32 @test_fence4(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence4
+; CHECK-LABEL: entry
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8 *
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}

Added: llvm/trunk/test/Transforms/LICM/infinite_loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/infinite_loops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/infinite_loops.ll (added)
+++ llvm/trunk/test/Transforms/LICM/infinite_loops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+
+; Make sure we don't hoist the unsafe division to some executable block.
+define void @test_impossible_exit_in_untaken_block(i32 %a, i32 %b, i32* %p) {
+; CHECK-LABEL: @test_impossible_exit_in_untaken_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 false, label [[NEVER_TAKEN:%.*]], label [[BACKEDGE]]
+; CHECK:       never_taken:
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    store i32 [[DIV]], i32* [[P:%.*]]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[EXIT:%.*]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  br i1 false, label %never_taken, label %backedge
+
+never_taken:
+  %div = sdiv i32 %a, %b
+  store i32 %div, i32* %p
+  br i1 true, label %backedge, label %exit
+
+backedge:
+  %iv.next = add i32 %iv, 1
+  br label %loop
+
+exit:
+  ret void
+}
+
+; The test above is UB in C++, because there is a requirement that any
+; thread should eventually terminate, execute a volatile access operation, call
+; IO or synchronize. In spite of that, the behavior in the test above *might* be
+; correct. This one is equivalent to the test above, but it has a volatile
+; memory access in the loop's mustexec block, so the compiler no longer has a
+; right to assume that it must terminate. Show that the same problem persists,
+; and that it was a bug and not a cool optimization based on loop infinity.
+; At the time this test was added, it was accidentally correct due to
+; reasons not directly related to this piece of logic. Make sure that it stays
+; correct in the future.
+define void @test_impossible_exit_in_untaken_block_no_ub(i32 %a, i32 %b, i32* noalias %p, i32* noalias %vp) {
+; CHECK-LABEL: @test_impossible_exit_in_untaken_block_no_ub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load volatile i32, i32* [[VP:%.*]]
+; CHECK-NEXT:    br i1 false, label [[NEVER_TAKEN:%.*]], label [[BACKEDGE]]
+; CHECK:       never_taken:
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    store i32 [[DIV]], i32* [[P:%.*]]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[EXIT:%.*]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  load volatile i32, i32* %vp
+  br i1 false, label %never_taken, label %backedge
+
+never_taken:
+  %div = sdiv i32 %a, %b
+  store i32 %div, i32* %p
+  br i1 true, label %backedge, label %exit
+
+backedge:
+  %iv.next = add i32 %iv, 1
+  br label %loop
+
+exit:
+  ret void
+}
+
+; Same as above, but the volatile access is in the mustexecute backedge block.
+; The loop is no longer "finite by specification"; make sure we don't hoist sdiv
+; from it no matter how general the MustThrow analysis is.
+define void @test_impossible_exit_in_untaken_block_no_ub_2(i32 %a, i32 %b, i32* noalias %p, i32* noalias %vp) {
+; CHECK-LABEL: @test_impossible_exit_in_untaken_block_no_ub_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 false, label [[NEVER_TAKEN:%.*]], label [[BACKEDGE]]
+; CHECK:       never_taken:
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    store i32 [[DIV]], i32* [[P:%.*]]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[EXIT:%.*]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load volatile i32, i32* [[VP:%.*]]
+; CHECK-NEXT:    br label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  br i1 false, label %never_taken, label %backedge
+
+never_taken:
+  %div = sdiv i32 %a, %b
+  store i32 %div, i32* %p
+  br i1 true, label %backedge, label %exit
+
+backedge:
+  %iv.next = add i32 %iv, 1
+  load volatile i32, i32* %vp
+  br label %loop
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/int_sideeffect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/int_sideeffect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/int_sideeffect.ll (added)
+++ llvm/trunk/test/Transforms/LICM/int_sideeffect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt -S < %s -licm | FileCheck %s
+
+declare void @llvm.sideeffect()
+
+; LICM across a @llvm.sideeffect.
+
+; CHECK-LABEL: licm
+; CHECK: load
+; CHECK: loop:
+; CHECK-NOT: load
+define float @licm(i64 %n, float* nocapture readonly %p) #0 {
+bb0:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %bb0 ], [ %t5, %loop ]
+  %sum = phi float [ 0.000000e+00, %bb0 ], [ %t4, %loop ]
+  call void @llvm.sideeffect()
+  %t3 = load float, float* %p
+  %t4 = fadd float %sum, %t3
+  %t5 = add i64 %i, 1
+  %t6 = icmp ult i64 %t5, %n
+  br i1 %t6, label %loop, label %bb2
+
+bb2:
+  ret float %t4
+}

Added: llvm/trunk/test/Transforms/LICM/invariant.start.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/invariant.start.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/invariant.start.ll (added)
+++ llvm/trunk/test/Transforms/LICM/invariant.start.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,138 @@
+; RUN: opt -licm -basicaa -licm-n2-threshold=0 < %s -S | FileCheck %s
+; RUN: opt -licm -basicaa -licm-n2-threshold=200 < %s -S | FileCheck %s --check-prefix=ALIAS-N2
+; RUN: opt -aa-pipeline=basic-aa -licm-n2-threshold=0 -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -licm-n2-threshold=200 -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s --check-prefix=ALIAS-N2
+
+define void @test1(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test1(
+; CHECK-LABEL: entry:
+; CHECK: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+; CHECK: %val = load i32, i32* %ptr
+; CHECK-LABEL: loop:
+
+; ALIAS-N2-LABEL: @test1(
+; ALIAS-N2-LABEL: entry:
+; ALIAS-N2: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+; ALIAS-N2: %val = load i32, i32* %ptr
+; ALIAS-N2-LABEL: loop:
+
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+;; Despite the loop-varying invariant.start, we should be
+;; able to hoist the load.
+define void @test2(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test2(
+; CHECK-LABEL: entry:
+; CHECK: %val = load i32, i32* %ptr
+; CHECK-LABEL: loop:
+; CHECK: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %piv)
+
+; ALIAS-N2-LABEL: @test2(
+; ALIAS-N2-LABEL: entry:
+; ALIAS-N2:         %val = load i32, i32* %ptr
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2:         call {}* @llvm.invariant.start.p0i32(i64 4, i32* %piv)
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  %piv = getelementptr i32, i32* %ptr, i32 %x
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %piv)
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+define void @test3(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test3(
+; CHECK-LABEL: entry:
+; CHECK: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+; CHECK: %val = load i32, i32* %ptr
+; CHECK-LABEL: loop:
+
+; ALIAS-N2-LABEL: @test3(
+; ALIAS-N2-LABEL: entry:
+; ALIAS-N2: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+; ALIAS-N2: %val = load i32, i32* %ptr
+; ALIAS-N2-LABEL: loop:
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+  %val = load i32, i32* %ptr
+  %p2 = getelementptr i32, i32* %ptr, i32 1
+  store volatile i32 0, i32* %p2
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; can't hoist due to init in loop, only well defined if loop exits
+; on first iteration, but we don't bother checking for that currently
+define void @test4(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test4(
+; CHECK-LABEL: entry:
+; CHECK-LABEL: loop:
+; CHECK:   store i32 0, i32* %ptr
+; CHECK: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+; CHECK: %val = load i32, i32* %ptr
+
+; ALIAS-N2-LABEL: @test4(
+; ALIAS-N2-LABEL: entry:
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2:   store i32 0, i32* %ptr
+; ALIAS-N2: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+; ALIAS-N2: %val = load i32, i32* %ptr
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  store i32 0, i32* %ptr
+  call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; don't try to reason about scopes
+define void @test5(i1 %cond, i32* %ptr) {
+; CHECK-LABEL: @test5(
+; CHECK-LABEL: entry:
+; CHECK-LABEL: loop:
+; CHECK:   store i32 0, i32* %ptr
+; CHECK: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+; CHECK: %val = load i32, i32* %ptr
+
+; ALIAS-N2-LABEL: @test5(
+; ALIAS-N2-LABEL: entry:
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2:   store i32 0, i32* %ptr
+; ALIAS-N2: call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+; ALIAS-N2: %val = load i32, i32* %ptr
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  store i32 0, i32* %ptr
+  %scope = call {}* @llvm.invariant.start.p0i32(i64 4, i32* %ptr)
+  %val = load i32, i32* %ptr
+  call void @llvm.invariant.end.p0i32({}* %scope, i64 4, i32* %ptr)
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+declare {}* @llvm.invariant.start.p0i32(i64, i32*)
+declare void @llvm.invariant.end.p0i32({}*, i64, i32*)

Added: llvm/trunk/test/Transforms/LICM/lcssa-ssa-promoter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/lcssa-ssa-promoter.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/lcssa-ssa-promoter.ll (added)
+++ llvm/trunk/test/Transforms/LICM/lcssa-ssa-promoter.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,77 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s| FileCheck %s
+;
+; Manually validate LCSSA form is preserved even after SSAUpdater is used to
+; promote things in the loop bodies.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = common global i32 0, align 4
+@y = common global i32 0, align 4
+
+define void @PR18688() {
+; CHECK-LABEL: @PR18688(
+
+entry:
+  br i1 undef, label %return, label %outer.preheader
+
+outer.preheader:
+  br label %outer.header
+; CHECK: outer.preheader:
+; CHECK: br label %outer.header
+
+outer.header:
+  store i32 0, i32* @x, align 4
+  br i1 undef, label %outer.latch, label %inner.preheader
+; CHECK: outer.header:
+; CHECK-NEXT: br i1 undef, label %outer.latch, label %inner.preheader
+
+inner.preheader:
+  br label %inner.header
+; CHECK: inner.preheader:
+; CHECK-NEXT: br label %inner.header
+
+inner.header:
+  br i1 undef, label %inner.body.rhs, label %inner.latch
+; CHECK: inner.header:
+; CHECK-NEXT: %[[PHI0:[^,]+]] = phi i32 [ %{{[^,]+}}, %inner.latch ], [ 0, %inner.preheader ]
+; CHECK-NEXT: br i1 undef, label %inner.body.rhs, label %inner.latch
+
+inner.body.rhs:
+  store i32 0, i32* @x, align 4
+  br label %inner.latch
+; CHECK: inner.body.rhs:
+; CHECK-NEXT: br label %inner.latch
+
+inner.latch:
+  %y_val = load i32, i32* @y, align 4
+  %icmp = icmp eq i32 %y_val, 0
+  br i1 %icmp, label %inner.exit, label %inner.header
+; CHECK: inner.latch:
+; CHECK-NEXT: %[[PHI1:[^,]+]] = phi i32 [ 0, %inner.body.rhs ], [ %[[PHI0]], %inner.header ]
+; CHECK-NEXT: br i1 %{{[^,]+}}, label %inner.exit, label %inner.header
+
+inner.exit:
+  br label %outer.latch
+; CHECK: inner.exit:
+; CHECK-NEXT: %[[INNER_LCSSA:[^,]+]] = phi i32 [ %[[PHI1]], %inner.latch ]
+; CHECK-NEXT: br label %outer.latch
+
+outer.latch:
+  br i1 undef, label %outer.exit, label %outer.header
+; CHECK: outer.latch:
+; CHECK-NEXT: %[[PHI2:[^,]+]] = phi i32 [ %[[INNER_LCSSA]], %inner.exit ], [ 0, %outer.header ]
+; CHECK-NEXT: br i1 {{.*}}, label %outer.exit, label %outer.header
+
+outer.exit:
+  br label %return
+; CHECK: outer.exit:
+; CHECK-NEXT: %[[OUTER_LCSSA:[^,]+]] = phi i32 [ %[[PHI2]], %outer.latch ]
+; CHECK-NEXT: store i32 %[[OUTER_LCSSA]]
+; CHECK-NEXT: br label %return
+
+return:
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/loopsink-pr38462.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/loopsink-pr38462.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/loopsink-pr38462.ll (added)
+++ llvm/trunk/test/Transforms/LICM/loopsink-pr38462.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; RUN: opt -S -loop-sink < %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.13.26128"
+
+%struct.FontInfoData = type { i32 (...)** }
+%struct.S = type { i8 }
+
+; CHECK: @pr38462
+; Make sure not to assert by trying to sink into catch.dispatch.
+
+define void @pr38462(%struct.FontInfoData* %this) personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) !prof !1 {
+entry:
+  %s = alloca %struct.S
+  %call6 = call i32 @f()
+  %tobool7 = icmp eq i32 %call6, 0
+  br i1 %tobool7, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:
+  %0 = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 0
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %call2 = invoke i32 @f() to label %__try.cont unwind label %catch.dispatch
+
+catch.dispatch:
+  %1 = catchswitch within none [label %__except] unwind to caller
+
+__except:
+  %2 = catchpad within %1 [i8* null]
+  catchret from %2 to label %__except3
+
+__except3:
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %0)
+  %call.i = call zeroext i1 @g(%struct.S* nonnull %s)
+  br i1 %call.i, label %if.then.i, label %exit
+
+if.then.i:
+  %call2.i = call i32 @f()
+  br label %exit
+
+exit:
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %0)
+  br label %__try.cont
+
+__try.cont:
+  %call = call i32 @f()
+  %tobool = icmp eq i32 %call, 0
+  br i1 %tobool, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+declare i32 @__C_specific_handler(...)
+declare i32 @f()
+declare zeroext i1 @g(%struct.S*)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
+
+!1 = !{!"function_entry_count", i64 1}
+

Added: llvm/trunk/test/Transforms/LICM/loopsink-pr39570.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/loopsink-pr39570.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/loopsink-pr39570.ll (added)
+++ llvm/trunk/test/Transforms/LICM/loopsink-pr39570.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,112 @@
+; RUN: opt -S -loop-sink < %s | FileCheck %s
+
+; CHECK: pr39570
+; Make sure not to assert.
+
+%0 = type { i32, %1*, %2, %6*, %33* }
+%1 = type { i32 (...)** }
+%2 = type { %3* }
+%3 = type { %4, i32, %5* }
+%4 = type { i32 (...)**, i32 }
+%5 = type opaque
+%6 = type { %7, %1*, %31*, i8, %2, %32* }
+%7 = type <{ %8, %9*, %10, i32, %33*, %33*, %33*, %27, %28, i16 }>
+%8 = type { i32 (...)** }
+%9 = type opaque
+%10 = type { %11, %16, %18, %19 }
+%11 = type { %12*, i32, i32, %13* }
+%12 = type { i32 (...)** }
+%13 = type { %14*, %14* }
+%14 = type { %15, i32 }
+%15 = type { %12*, i32, i32, i16* }
+%16 = type { %12*, i32, i32, %17* }
+%17 = type { %13, %14* }
+%18 = type { %12*, i32, i32, %14** }
+%19 = type { %20, %21, %12*, float, i32, i32, %22, %22, %24, i32, i32 }
+%20 = type { i8 }
+%21 = type { i8 }
+%22 = type { %12*, %23*, %23* }
+%23 = type opaque
+%24 = type { %12*, i32, i32, %25* }
+%25 = type { %12*, i32, i32, %26* }
+%26 = type opaque
+%27 = type { %33* }
+%28 = type { %29, i32, i32, %14* }
+%29 = type { %30 }
+%30 = type { i32 (...)** }
+%31 = type opaque
+%32 = type { i32 (...)** }
+%33 = type <{ %8, %9*, %10, i32, %33*, %33*, %33*, %27, %28, i16, [2 x i8] }>
+
+define dso_local void @pr39570() local_unnamed_addr align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !prof !1 {
+  br i1 undef, label %8, label %1, !prof !2
+
+; <label>:1:                                      ; preds = %0
+  %2 = load %0*, %0** undef, align 4
+  br label %3
+
+; <label>:3:                                      ; preds = %7, %1
+  %4 = getelementptr inbounds %0, %0* %2, i32 undef, i32 0
+  br label %5
+
+; <label>:5:                                      ; preds = %3
+  %6 = getelementptr inbounds %0, %0* %2, i32 undef, i32 4
+  br i1 undef, label %18, label %7, !prof !3
+
+; <label>:7:                                      ; preds = %5
+  br label %3
+
+; <label>:8:                                      ; preds = %0
+  invoke void @baz()
+          to label %9 unwind label %12
+
+; <label>:9:                                      ; preds = %8
+  invoke void @bar()
+          to label %17 unwind label %10
+
+; <label>:10:                                     ; preds = %9
+  %11 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:12:                                     ; preds = %8
+  %13 = landingpad { i8*, i32 }
+          cleanup
+  invoke void @bar()
+          to label %16 unwind label %14
+
+; <label>:14:                                     ; preds = %12
+  %15 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:16:                                     ; preds = %12
+  resume { i8*, i32 } %13
+
+; <label>:17:                                     ; preds = %9
+  br label %18
+
+; <label>:18:                                     ; preds = %17, %5
+  invoke void @baz()
+          to label %19 unwind label %20
+
+; <label>:19:                                     ; preds = %18
+  invoke void @bar()
+          to label %22 unwind label %20
+
+; <label>:20:                                     ; preds = %19
+  %21 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:22:                                     ; preds = %19
+  ret void
+}
+
+declare dso_local i32 @__gxx_personality_v0(...)
+declare dso_local void @bar() local_unnamed_addr
+declare dso_local void @baz() local_unnamed_addr align 2
+
+!1 = !{!"function_entry_count", i64 0}
+!2 = !{!"branch_weights", i32 1, i32 3215551}
+!3 = !{!"branch_weights", i32 3215551, i32 1}

Added: llvm/trunk/test/Transforms/LICM/loopsink-pr39695.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/loopsink-pr39695.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/loopsink-pr39695.ll (added)
+++ llvm/trunk/test/Transforms/LICM/loopsink-pr39695.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; RUN: opt -S -loop-sink < %s | FileCheck %s
+
+; The load instruction should not be sunk into the following loop.
+; CHECK:      @foo
+; CHECK-NEXT: entry
+; CHECK-NEXT: %ptr = load i8*, i8** %pp, align 8
+; CHECK-NEXT: store i8* null, i8** %pp, align 8
+
+define i32 @foo(i32 %n, i8** %pp) !prof !0 {
+entry:
+  %ptr = load i8*, i8** %pp, align 8
+  store i8* null, i8** %pp, align 8
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp ult i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:                                         ; preds = %for.cond
+  %0 = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i8, i8* %ptr, i64 %0
+  %1 = load i8, i8* %arrayidx, align 1
+  %or19 = call i8 @llvm.bitreverse.i8(i8 %1)
+  %v = sext i8 %or19 to i32
+  %inc = add i32 %i.0, %v
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret i32 %i.0
+}
+
+declare i8 @llvm.bitreverse.i8(i8) #0
+attributes #0 = { nounwind readnone speculatable }
+
+!0 = !{!"function_entry_count", i64 1}
+!1 = !{!"branch_weights", i32 1, i32 2000}

Added: llvm/trunk/test/Transforms/LICM/loopsink.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/loopsink.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/loopsink.ll (added)
+++ llvm/trunk/test/Transforms/LICM/loopsink.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,445 @@
+; RUN: opt -S -loop-sink < %s | FileCheck %s
+; RUN: opt -S -aa-pipeline=basic-aa -passes=loop-sink < %s | FileCheck %s
+
+@g = global i32 0, align 4
+
+;     b1
+;    /  \
+;   b2  b6
+;  /  \  |
+; b3  b4 |
+;  \  /  |
+;   b5   |
+;    \  /
+;     b7
+; preheader: 1000
+; b2: 15
+; b3: 7
+; b4: 7
+; Sink load to b2
+; CHECK: t1
+; CHECK: .b2:
+; CHECK: load i32, i32* @g
+; CHECK: .b3:
+; CHECK-NOT:  load i32, i32* @g
+define i32 @t1(i32, i32) #0 !prof !0 {
+  %3 = icmp eq i32 %1, 0
+  br i1 %3, label %.exit, label %.preheader
+
+.preheader:
+  %invariant = load i32, i32* @g
+  br label %.b1
+
+.b1:
+  %iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
+  %c1 = icmp sgt i32 %iv, %0
+  br i1 %c1, label %.b2, label %.b6, !prof !1
+
+.b2:
+  %c2 = icmp sgt i32 %iv, 1
+  br i1 %c2, label %.b3, label %.b4
+
+.b3:
+  %t3 = sub nsw i32 %invariant, %iv
+  br label %.b5
+
+.b4:
+  %t4 = add nsw i32 %invariant, %iv
+  br label %.b5
+
+.b5:
+  %p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
+  %t5 = mul nsw i32 %p5, 5
+  br label %.b7
+
+.b6:
+  %t6 = add nsw i32 %iv, 100
+  br label %.b7
+
+.b7:
+  %p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
+  %t7 = add nuw nsw i32 %iv, 1
+  %c7 = icmp eq i32 %t7, %p7
+  br i1 %c7, label %.b1, label %.exit, !prof !3
+
+.exit:
+  ret i32 10
+}
+
+;     b1
+;    /  \
+;   b2  b6
+;  /  \  |
+; b3  b4 |
+;  \  /  |
+;   b5   |
+;    \  /
+;     b7
+; preheader: 500
+; b1: 16016
+; b3: 8
+; b6: 8
+; Sink load to b3 and b6
+; CHECK: t2
+; CHECK: .preheader:
+; CHECK-NOT: load i32, i32* @g
+; CHECK: .b3:
+; CHECK: load i32, i32* @g
+; CHECK: .b4:
+; CHECK: .b6:
+; CHECK: load i32, i32* @g
+; CHECK: .b7:
+define i32 @t2(i32, i32) #0 !prof !0 {
+  %3 = icmp eq i32 %1, 0
+  br i1 %3, label %.exit, label %.preheader
+
+.preheader:
+  %invariant = load i32, i32* @g
+  br label %.b1
+
+.b1:
+  %iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
+  %c1 = icmp sgt i32 %iv, %0
+  br i1 %c1, label %.b2, label %.b6, !prof !2
+
+.b2:
+  %c2 = icmp sgt i32 %iv, 1
+  br i1 %c2, label %.b3, label %.b4, !prof !1
+
+.b3:
+  %t3 = sub nsw i32 %invariant, %iv
+  br label %.b5
+
+.b4:
+  %t4 = add nsw i32 5, %iv
+  br label %.b5
+
+.b5:
+  %p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
+  %t5 = mul nsw i32 %p5, 5
+  br label %.b7
+
+.b6:
+  %t6 = add nsw i32 %iv, %invariant
+  br label %.b7
+
+.b7:
+  %p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
+  %t7 = add nuw nsw i32 %iv, 1
+  %c7 = icmp eq i32 %t7, %p7
+  br i1 %c7, label %.b1, label %.exit, !prof !3
+
+.exit:
+  ret i32 10
+}
+
+;     b1
+;    /  \
+;   b2  b6
+;  /  \  |
+; b3  b4 |
+;  \  /  |
+;   b5   |
+;    \  /
+;     b7
+; preheader: 500
+; b3: 8
+; b5: 16008
+; Do not sink load from preheader.
+; CHECK: t3
+; CHECK: .preheader:
+; CHECK: load i32, i32* @g
+; CHECK: .b1:
+; CHECK-NOT: load i32, i32* @g
+define i32 @t3(i32, i32) #0 !prof !0 {
+  %3 = icmp eq i32 %1, 0
+  br i1 %3, label %.exit, label %.preheader
+
+.preheader:
+  %invariant = load i32, i32* @g
+  br label %.b1
+
+.b1:
+  %iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
+  %c1 = icmp sgt i32 %iv, %0
+  br i1 %c1, label %.b2, label %.b6, !prof !2
+
+.b2:
+  %c2 = icmp sgt i32 %iv, 1
+  br i1 %c2, label %.b3, label %.b4, !prof !1
+
+.b3:
+  %t3 = sub nsw i32 %invariant, %iv
+  br label %.b5
+
+.b4:
+  %t4 = add nsw i32 5, %iv
+  br label %.b5
+
+.b5:
+  %p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
+  %t5 = mul nsw i32 %p5, %invariant
+  br label %.b7
+
+.b6:
+  %t6 = add nsw i32 %iv, 5
+  br label %.b7
+
+.b7:
+  %p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
+  %t7 = add nuw nsw i32 %iv, 1
+  %c7 = icmp eq i32 %t7, %p7
+  br i1 %c7, label %.b1, label %.exit, !prof !3
+
+.exit:
+  ret i32 10
+}
+
+; For single-BB loop with <=1 avg trip count, sink load to b1
+; CHECK: t4
+; CHECK: .preheader:
+; CHECK-NOT: load i32, i32* @g
+; CHECK: .b1:
+; CHECK: load i32, i32* @g
+; CHECK: .exit:
+define i32 @t4(i32, i32) #0 !prof !0 {
+.preheader:
+  %invariant = load i32, i32* @g
+  br label %.b1
+
+.b1:
+  %iv = phi i32 [ %t1, %.b1 ], [ 0, %.preheader ]
+  %t1 = add nsw i32 %invariant, %iv
+  %c1 = icmp sgt i32 %iv, %0
+  br i1 %c1, label %.b1, label %.exit, !prof !1
+
+.exit:
+  ret i32 10
+}
+
+;     b1
+;    /  \
+;   b2  b6
+;  /  \  |
+; b3  b4 |
+;  \  /  |
+;   b5   |
+;    \  /
+;     b7
+; preheader: 1000
+; b2: 15
+; b3: 7
+; b4: 7
+; There is a potentially aliasing write in the loop (the call to @foo), so do not sink the load
+; CHECK: t5
+; CHECK: .preheader:
+; CHECK: load i32, i32* @g
+; CHECK: .b1:
+; CHECK-NOT: load i32, i32* @g
+define i32 @t5(i32, i32*) #0 !prof !0 {
+  %3 = icmp eq i32 %0, 0
+  br i1 %3, label %.exit, label %.preheader
+
+.preheader:
+  %invariant = load i32, i32* @g
+  br label %.b1
+
+.b1:
+  %iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
+  %c1 = icmp sgt i32 %iv, %0
+  br i1 %c1, label %.b2, label %.b6, !prof !1
+
+.b2:
+  %c2 = icmp sgt i32 %iv, 1
+  br i1 %c2, label %.b3, label %.b4
+
+.b3:
+  %t3 = sub nsw i32 %invariant, %iv
+  br label %.b5
+
+.b4:
+  %t4 = add nsw i32 %invariant, %iv
+  br label %.b5
+
+.b5:
+  %p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
+  %t5 = mul nsw i32 %p5, 5
+  br label %.b7
+
+.b6:
+  %t6 = call i32 @foo()
+  br label %.b7
+
+.b7:
+  %p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
+  %t7 = add nuw nsw i32 %iv, 1
+  %c7 = icmp eq i32 %t7, %p7
+  br i1 %c7, label %.b1, label %.exit, !prof !3
+
+.exit:
+  ret i32 10
+}
+
+;     b1
+;    /  \
+;   b2  b6
+;  /  \  |
+; b3  b4 |
+;  \  /  |
+;   b5   |
+;    \  /
+;     b7
+; preheader: 1000
+; b2: 15
+; b3: 7
+; b4: 7
+; Regardless of aliasing store in loop this load from constant memory can be sunk.
+; CHECK: t5_const_memory
+; CHECK: .preheader:
+; CHECK-NOT: load i32, i32* @g_const
+; CHECK: .b2:
+; CHECK: load i32, i32* @g_const
+; CHECK: br i1 %c2, label %.b3, label %.b4
+define i32 @t5_const_memory(i32, i32*) #0 !prof !0 {
+  %3 = icmp eq i32 %0, 0
+  br i1 %3, label %.exit, label %.preheader
+
+.preheader:
+  %invariant = load i32, i32* @g_const
+  br label %.b1
+
+.b1:
+  %iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
+  %c1 = icmp sgt i32 %iv, %0
+  br i1 %c1, label %.b2, label %.b6, !prof !1
+
+.b2:
+  %c2 = icmp sgt i32 %iv, 1
+  br i1 %c2, label %.b3, label %.b4
+
+.b3:
+  %t3 = sub nsw i32 %invariant, %iv
+  br label %.b5
+
+.b4:
+  %t4 = add nsw i32 %invariant, %iv
+  br label %.b5
+
+.b5:
+  %p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
+  %t5 = mul nsw i32 %p5, 5
+  br label %.b7
+
+.b6:
+  %t6 = call i32 @foo()
+  br label %.b7
+
+.b7:
+  %p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
+  %t7 = add nuw nsw i32 %iv, 1
+  %c7 = icmp eq i32 %t7, %p7
+  br i1 %c7, label %.b1, label %.exit, !prof !3
+
+.exit:
+  ret i32 10
+}
+
+;     b1
+;    /  \
+;   b2  b3
+;    \  /
+;     b4
+; preheader: 1000
+; b2: 15
+; b3: 7
+; Do not sink unordered atomic load to b2
+; CHECK: t6
+; CHECK: .preheader:
+; CHECK:  load atomic i32, i32* @g unordered, align 4
+; CHECK: .b2:
+; CHECK-NOT: load atomic i32, i32* @g unordered, align 4
+define i32 @t6(i32, i32) #0 !prof !0 {
+  %3 = icmp eq i32 %1, 0
+  br i1 %3, label %.exit, label %.preheader
+
+.preheader:
+  %invariant = load atomic i32, i32* @g unordered, align 4
+  br label %.b1
+
+.b1:
+  %iv = phi i32 [ %t3, %.b4 ], [ 0, %.preheader ]
+  %c1 = icmp sgt i32 %iv, %0
+  br i1 %c1, label %.b2, label %.b3, !prof !1
+
+.b2:
+  %t1 = add nsw i32 %invariant, %iv
+  br label %.b4
+
+.b3:
+  %t2 = add nsw i32 %iv, 100
+  br label %.b4
+
+.b4:
+  %p1 = phi i32 [ %t2, %.b3 ], [ %t1, %.b2 ]
+  %t3 = add nuw nsw i32 %iv, 1
+  %c2 = icmp eq i32 %t3, %p1
+  br i1 %c2, label %.b1, label %.exit, !prof !3
+
+.exit:
+  ret i32 10
+}
+
+@g_const = constant i32 0, align 4
+
+;     b1
+;    /  \
+;   b2  b3
+;    \  /
+;     b4
+; preheader: 1000
+; b2: 0.5
+; b3: 999.5
+; Sink unordered atomic load to b2. Sinking an unordered load from a
+; constant into the loop is allowed.
+; CHECK: t7
+; CHECK: .preheader:
+; CHECK-NOT:  load atomic i32, i32* @g_const unordered, align 4
+; CHECK: .b2:
+; CHECK: load atomic i32, i32* @g_const unordered, align 4
+define i32 @t7(i32, i32) #0 !prof !0 {
+  %3 = icmp eq i32 %1, 0
+  br i1 %3, label %.exit, label %.preheader
+
+.preheader:
+  %invariant = load atomic i32, i32* @g_const unordered, align 4
+  br label %.b1
+
+.b1:
+  %iv = phi i32 [ %t3, %.b4 ], [ 0, %.preheader ]
+  %c1 = icmp sgt i32 %iv, %0
+  br i1 %c1, label %.b2, label %.b3, !prof !1
+
+.b2:
+  %t1 = add nsw i32 %invariant, %iv
+  br label %.b4
+
+.b3:
+  %t2 = add nsw i32 %iv, 100
+  br label %.b4
+
+.b4:
+  %p1 = phi i32 [ %t2, %.b3 ], [ %t1, %.b2 ]
+  %t3 = add nuw nsw i32 %iv, 1
+  %c2 = icmp eq i32 %t3, %p1
+  br i1 %c2, label %.b1, label %.exit, !prof !3
+
+.exit:
+  ret i32 10
+}
+
+declare i32 @foo()
+
+!0 = !{!"function_entry_count", i64 1}
+!1 = !{!"branch_weights", i32 1, i32 2000}
+!2 = !{!"branch_weights", i32 2000, i32 1}
+!3 = !{!"branch_weights", i32 100, i32 1}

Added: llvm/trunk/test/Transforms/LICM/no-preheader-test.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/no-preheader-test.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/no-preheader-test.ll (added)
+++ llvm/trunk/test/Transforms/LICM/no-preheader-test.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; Test that LICM works when there is not a loop-preheader
+; RUN: opt < %s -licm | llvm-dis
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s | llvm-dis
+
+define void @testfunc(i32 %i.s, i1 %ifcond) {
+	br i1 %ifcond, label %Then, label %Else
+Then:		; preds = %0
+	br label %Loop
+Else:		; preds = %0
+	br label %Loop
+Loop:		; preds = %Loop, %Else, %Then
+	%j = phi i32 [ 0, %Then ], [ 12, %Else ], [ %Next, %Loop ]		; <i32> [#uses=1]
+	%i = bitcast i32 %i.s to i32		; <i32> [#uses=1]
+	%i2 = mul i32 %i, 17		; <i32> [#uses=1]
+	%Next = add i32 %j, %i2		; <i32> [#uses=2]
+	%cond = icmp eq i32 %Next, 0		; <i1> [#uses=1]
+	br i1 %cond, label %Out, label %Loop
+Out:		; preds = %Loop
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/LICM/opt-remarks-conditional-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/opt-remarks-conditional-load.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/opt-remarks-conditional-load.ll (added)
+++ llvm/trunk/test/Transforms/LICM/opt-remarks-conditional-load.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; RUN: opt < %s -licm -pass-remarks-missed=licm -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks-missed=licm 2>&1 | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+; With the load from %p conditional, we can't optimize this and the remark
+; should tell us about it.
+
+define void @test(i32* %array, i32* noalias %p) {
+Entry:
+  br label %Loop
+
+Loop:
+  %j = phi i32 [ 0, %Entry ], [ %Next, %else]
+  %addr = getelementptr i32, i32* %array, i32 %j
+  %a = load i32, i32* %addr
+  %c = icmp eq i32 %a, 0
+  br i1 %c, label %then, label %else
+
+then:                                             ; reached only when %a == 0
+; CHECK: remark: /tmp/kk.c:2:20: failed to hoist load with loop-invariant address because load is conditionally executed
+  %b = load i32, i32* %p, !dbg !8                 ; %p is loop-invariant, but this block does not run every iteration
+  %a2 = add i32 %a, %b
+  store i32 %a2, i32* %addr
+  br label %else
+
+else:                                             ; loop latch: reached from both %Loop and %then
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0 "}
+!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 2, column: 20, scope: !6)

Added: llvm/trunk/test/Transforms/LICM/opt-remarks-intervening-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/opt-remarks-intervening-store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/opt-remarks-intervening-store.ll (added)
+++ llvm/trunk/test/Transforms/LICM/opt-remarks-intervening-store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,67 @@
+; RUN: opt < %s -licm -pass-remarks-missed=licm -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks-missed=licm 2>&1 | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+; Without the noalias on %p, we can't optimize this and the remark should tell
+; us about it.
+
+define void @test(i32* %array, i32* %p) {
+Entry:
+  br label %Loop
+
+Loop:
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+  %addr = getelementptr i32, i32* %array, i32 %j
+  %a = load i32, i32* %addr
+; CHECK: remark: /tmp/kk.c:2:20: failed to move load with loop-invariant address because the loop may invalidate its value
+  %b = load i32, i32* %p, !dbg !8                 ; %p is loop-invariant but carries no noalias
+  %a2 = add i32 %a, %b
+  store i32 %a2, i32* %addr                       ; may alias %p, so it may change the value loaded into %b
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+}
+
+; This illustrates why we need to check loop-invariance before issuing this
+; remark.
+
+define i32 @invalidated_load_with_non_loop_invariant_address(i32* %array, i32* %array2) {
+Entry:
+  br label %Loop
+
+Loop:
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+
+; CHECK-NOT: /tmp/kk.c:3:20: {{.*}} loop-invariant
+  %addr = getelementptr i32, i32* %array, i32 %j  ; indexed by %j, so the address changes every iteration
+  %a = load i32, i32* %addr, !dbg !9
+
+  %addr2 = getelementptr i32, i32* %array2, i32 %j
+  store i32 %j, i32* %addr2                       ; neither argument is noalias, so this may alias %addr
+
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  %a2 = phi i32 [ %a, %Loop ]
+  ret i32 %a2
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0 "}
+!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 2, column: 20, scope: !6)
+!9 = !DILocation(line: 3, column: 20, scope: !6)

Added: llvm/trunk/test/Transforms/LICM/opt-remarks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/opt-remarks.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/opt-remarks.ll (added)
+++ llvm/trunk/test/Transforms/LICM/opt-remarks.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,81 @@
+; RUN: opt < %s -licm -pass-remarks=licm -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' %s -o /dev/null -pass-remarks=licm 2>&1 | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+define void @hoist(i32* %array, i32* noalias %p) {
+Entry:
+  br label %Loop
+
+Loop:
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+  %addr = getelementptr i32, i32* %array, i32 %j
+  %a = load i32, i32* %addr
+; CHECK: remark: /tmp/kk.c:2:20: hoisting load
+  %b = load i32, i32* %p, !dbg !8                 ; %p is a noalias argument and never stored to in the loop
+  %a2 = add i32 %a, %b
+  store i32 %a2, i32* %addr
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+}
+
+define i32 @sink(i32* %array, i32* noalias %p, i32 %b) {
+Entry:
+  br label %Loop
+
+Loop:
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+  %addr = getelementptr i32, i32* %array, i32 %j
+  %a = load i32, i32* %addr
+  %a2 = add i32 %a, %b
+  store i32 %a2, i32* %addr
+; CHECK: remark: /tmp/kk.c:2:21: sinking add
+  %a3 = add i32 %a, 1, !dbg !9                    ; only user is the %a4 phi in the exit block %Out
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  %a4 = phi i32 [ %a3, %Loop ]
+  ret i32 %a4
+}
+
+define void @promote(i32* %array, i32* noalias %p) {
+Entry:
+  br label %Loop
+
+Loop:
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
+  %addr = getelementptr i32, i32* %array, i32 %j
+  %a = load i32, i32* %addr
+  %b = load i32, i32* %p                          ; %p is loaded here ...
+  %a2 = add i32 %a, %b
+  store i32 %a2, i32* %addr
+; CHECK: remark: /tmp/kk.c:2:22: Moving accesses to memory location out of the loop
+  store i32 %b, i32* %p, !dbg !10                 ; ... and stored here on every iteration
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0 "}
+!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 2, column: 20, scope: !6)
+!9 = !DILocation(line: 2, column: 21, scope: !6)
+!10 = !DILocation(line: 2, column: 22, scope: !6)

Added: llvm/trunk/test/Transforms/LICM/pr23608.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/pr23608.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/pr23608.ll (added)
+++ llvm/trunk/test/Transforms/LICM/pr23608.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,50 @@
+; RUN: opt -S -licm %s | FileCheck %s
+; ModuleID = '../pr23608.ll'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.PyFrameObject = type { i32 }
+
+@a = common global %struct.PyFrameObject* null, align 8
+@__msan_origin_tls = external thread_local(initialexec) global i32
+
+define void @fn1() {
+entry:
+  br label %indirectgoto
+
+while.cond:                                       ; preds = %indirectgoto, %bb15
+  %tmp = load %struct.PyFrameObject*, %struct.PyFrameObject** @a, align 8
+  %_msld = load i64, i64* inttoptr (i64 and (i64 ptrtoint (%struct.PyFrameObject** @a to i64), i64 -70368744177665) to i64*), align 8
+  %tmp1 = load i32, i32* inttoptr (i64 add (i64 and (i64 ptrtoint (%struct.PyFrameObject** @a to i64), i64 -70368744177665), i64 35184372088832) to i32*), align 8
+  %f_iblock = getelementptr inbounds %struct.PyFrameObject, %struct.PyFrameObject* %tmp, i64 0, i32 0
+  br label %bb2
+
+bb:                                               ; No predecessors!
+  call void @__msan_warning_noreturn()
+  unreachable
+
+bb2:                                              ; preds = %while.cond
+  %tmp3 = load i32, i32* %f_iblock, align 4
+  %tmp4 = ptrtoint i32* %f_iblock to i64
+  %tmp8 = inttoptr i64 %tmp4 to i32*              ; only user is the %.lcssa7 phi in exit block %bb13
+  %tobool = icmp eq i64 %tmp4, 0
+  br i1 %tobool, label %bb13, label %bb15
+
+bb13:                                             ; preds = %bb2
+; CHECK-LABEL: bb13:
+; CHECK: %tmp8.le = inttoptr
+  %.lcssa7 = phi i32* [ %tmp8, %bb2 ]
+  call void @__msan_warning_noreturn()
+  unreachable
+
+bb15:                                             ; preds = %bb2
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %bb15
+  ret void
+
+indirectgoto:                                     ; preds = %indirectgoto, %entry
+  indirectbr i8* null, [label %indirectgoto, label %while.cond]
+}
+
+declare void @__msan_warning_noreturn()

Added: llvm/trunk/test/Transforms/LICM/pr26843.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/pr26843.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/pr26843.ll (added)
+++ llvm/trunk/test/Transforms/LICM/pr26843.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc18.0.0"
+
+@v = common global i32 zeroinitializer, align 4
+
+; Make sure the store to v is not sunk past the memset
+; CHECK-LABEL: @main
+; CHECK: for.body:
+; CHECK-NEXT: store i32 1, i32* @v
+; CHECK-NEXT: tail call void @llvm.memset
+; CHECK: end:
+; CHECK-NEXT: ret i32 0
+
+define i32 @main(i1 %k) {
+entry:
+  br label %for.body
+ 
+for.body:
+  store i32 1, i32* @v, align 4                   ; must stay ahead of the memset below
+  tail call void @llvm.memset.p0i8.i32(i8* align 4 bitcast (i32* @v to i8*), i8 0, i32 4, i1 false)  ; zeroes the 4 bytes of @v
+  br label %for.latch
+  
+for.latch:
+  br i1 %k, label %for.body, label %end
+
+end:
+  ret i32 0
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1)

Added: llvm/trunk/test/Transforms/LICM/pr27262.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/pr27262.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/pr27262.ll (added)
+++ llvm/trunk/test/Transforms/LICM/pr27262.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc18.0.0"
+
+; Make sure the stores to %p and %p1 are not sunk past the memset
+; CHECK-LABEL: @main
+; CHECK: for.body:
+; CHECK-NEXT: store i8 1, i8* %p
+; CHECK-NEXT: store i8 2, i8* %p1
+; CHECK-NEXT: call void @llvm.memset
+; CHECK: end:
+; CHECK-NEXT: ret i32 0
+
+define i32 @main(i1 %k, i8* %p) {
+entry:
+  %p1 = getelementptr i8, i8* %p, i32 1           ; %p1 = %p + 1 byte
+  br label %for.body
+ 
+for.body:
+  store i8 1, i8* %p, align 1
+  store i8 2, i8* %p1, align 1
+  call void @llvm.memset.p0i8.i32(i8* %p, i8 255, i32 4, i1 false)  ; overwrites 4 bytes from %p, covering both stores above
+  br label %for.latch
+  
+for.latch:
+  br i1 %k, label %for.body, label %end
+
+end:
+  ret i32 0
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1)

Added: llvm/trunk/test/Transforms/LICM/pr32129.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/pr32129.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/pr32129.ll (added)
+++ llvm/trunk/test/Transforms/LICM/pr32129.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,18 @@
+; RUN: opt -S -licm -loop-unswitch -licm < %s | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NOT: guard
+entry:
+  br label %header
+
+header:
+  br label %loop
+
+loop:
+  %0 = icmp ult i32 0, 400                        ; constant operands: 0 <u 400 is always true
+  call void (i1, ...) @llvm.experimental.guard(i1 %0, i32 9) [ "deopt"() ]
+  br i1 undef, label %header, label %loop         ; both successors stay in the loop nest; the function never returns
+}

Added: llvm/trunk/test/Transforms/LICM/pr35342.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/pr35342.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/pr35342.ll (added)
+++ llvm/trunk/test/Transforms/LICM/pr35342.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: @f1
+; CHECK-LABEL: bci_524:
+; CHECK: add i32 undef, undef
+define void @f1(i32 %v) {
+not_zero.lr.ph:
+  br label %not_zero
+
+not_zero:
+  br i1 undef, label %bci_748 ,  label %bci_314
+
+bci_314:
+  %0 = select i1 undef, i32 undef, i32 undef      ; defined in the loop; its only uses are in unreachable %bci_524
+  br label %not_zero
+
+bci_524:                   ; No predecessors!
+  %add = add i32 %0, %0
+  br label %bci_748
+
+bci_748:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/pr36228.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/pr36228.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/pr36228.ll (added)
+++ llvm/trunk/test/Transforms/LICM/pr36228.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt -S -licm -o - %s | FileCheck %s
+;
+; Be sure that we don't hoist loads incorrectly if a loop has conditional UB.
+; See PR36228.
+
+declare void @check(i8)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+
+; CHECK-LABEL: define void @buggy
+define void @buggy(i8* %src, i1* %kOne) {
+entry:
+  %dst = alloca [1 x i8], align 1                 ; one-byte buffer
+  %0 = getelementptr inbounds [1 x i8], [1 x i8]* %dst, i64 0, i64 0
+  store i8 42, i8* %0, align 1
+  %src16 = bitcast i8* %src to i16*
+  %srcval = load i16, i16* %src16
+  br label %while.cond
+
+while.cond:                                       ; preds = %if.end, %entry
+  %dp.0 = phi i8* [ %0, %entry ], [ %dp.1, %if.end ]
+  %1 = load volatile i1, i1* %kOne, align 4
+  br i1 %1, label %if.else, label %if.then
+
+if.then:                                          ; preds = %while.cond
+  store i8 9, i8* %dp.0, align 1                  ; writes %dst[0] while %dp.0 still equals %0
+  br label %if.end
+
+if.else:                                          ; preds = %while.cond
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dp.0, i8* %src, i64 2, i32 1, i1 false)  ; 2-byte copy through %dp.0, which starts at the 1-byte %dst
+  %dp.new = getelementptr inbounds i8, i8* %dp.0, i64 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %dp.1 = phi i8* [ %dp.0, %if.then ], [ %dp.new, %if.else ]
+  ; CHECK: %2 = load i8, i8* %0
+  %2 = load i8, i8* %0, align 1
+  ; CHECK-NEXT: call void @check(i8 %2)
+  call void @check(i8 %2)
+  br label %while.cond
+}

Added: llvm/trunk/test/Transforms/LICM/pr37323.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/pr37323.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/pr37323.ll (added)
+++ llvm/trunk/test/Transforms/LICM/pr37323.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+;RUN: opt -verify-dom-info -loop-simplify -postdomtree -licm -adce -verify-loop-info -S -o - %s | FileCheck %s
+;RUN: opt -verify-dom-info -passes='loop-simplify,require<postdomtree>,require<opt-remark-emit>,loop(licm),function(adce)' -S -o - %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+@c = external global i16, align 1
+
+;Make sure this test does not crash while accessing PostDomTree, which is not
+;preserved in LICM.
+;
+;CHECK-LABEL: fn1()
+;CHECK-LABEL: for.cond.loopexit.split.loop.exit
+;CHECK-LABEL: for.cond.loopexit.split.loop.exit1
+define void @fn1() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end, %for.cond1, %entry
+  %0 = phi i16 [ undef, %entry ], [ ptrtoint (i16* @c to i16), %if.end ], [ %.mux, %for.cond1 ]  ; one entry edge plus two back edges
+  br i1 undef, label %for.cond1, label %for.end8
+
+for.cond1:                                        ; preds = %if.end, %for.cond
+  %.mux = select i1 undef, i16 undef, i16 ptrtoint (i16* @c to i16)
+  br i1 undef, label %for.cond, label %if.end
+
+if.end:                                           ; preds = %for.cond1
+  br i1 undef, label %for.cond, label %for.cond1  ; can branch back to either %for.cond or %for.cond1
+
+for.end8:                                         ; preds = %for.cond
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/pr40317.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/pr40317.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/pr40317.ll (added)
+++ llvm/trunk/test/Transforms/LICM/pr40317.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,62 @@
+; RUN: opt -S -march=z13 -tbaa -licm -enable-mssa-loop-dependency -licm-control-flow-hoisting -verify-memoryssa < %s | FileCheck %s
+
+target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+target triple = "s390x-ibm-linux"
+
+%0 = type { %1, %1, i16, %2 }
+%1 = type <{ i16, i8, i32, i32, i32, i64, i64 }>
+%2 = type { i8, i16, i16, [2 x i8] }
+
+@0 = internal global %0 { %1 <{ i16 22437, i8 117, i32 2017322857, i32 900074563, i32 -1390364, i64 0, i64 0 }>, %1 <{ i16 0, i8 7, i32 -387299562, i32 925371866, i32 -1, i64 4826244575317081679, i64 1 }>, i16 8, %2 { i8 0, i16 0, i16 3, [2 x i8] undef } }, align 2
+@g_18 = external dso_local global i64, align 8
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1
+
+; CHECK-LABEL: @func_94
+; CHECK: bb:
+; CHECK: tail call void @llvm.memset.p0i8.i64
+; CHECK: load i32
+; CHECK: bb6.licm:
+; Function Attrs: noreturn nounwind
+define dso_local void @func_94(i16 %arg, i64* nocapture %arg1) local_unnamed_addr #3 {
+bb:
+  tail call void @llvm.memset.p0i8.i64(i8* align 8 undef, i8 0, i64 80, i1 false)
+  br label %bb3
+
+bb3:                                              ; preds = %bb13, %bb
+  %tmp5 = icmp eq i16 %arg, 0                     ; loop-invariant: depends only on the argument %arg
+  br i1 %tmp5, label %bb6, label %bb13
+
+bb6:                                              ; preds = %bb3
+  %tmp7 = load i32, i32* getelementptr inbounds (%0, %0* @0, i64 0, i32 1, i32 2), align 1, !tbaa !11
+  %tmp8 = zext i32 %tmp7 to i64
+  %sext = shl i64 %tmp8, 56
+  %tmp10 = ashr exact i64 %sext, 56
+  store i64 %tmp10, i64* %arg1, align 8, !tbaa !12
+  br label %bb13
+
+bb13:                                             ; preds = %bb3, %bb6
+  br label %bb3                                   ; unconditional back edge: the loop has no exit
+}
+
+attributes #0 = { "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { norecurse nounwind readnone "use-soft-float"="false" }
+attributes #3 = { noreturn nounwind "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 8.0.0 (http://llvm.org/git/clang.git e593a791f2cf19db84237b0b9d632e9966a00a39) (http://llvm.org/git/llvm.git fe0523d1bd7def3ef62cfb3dd37a8b1941aafa81)"}
+!1 = !{!2, !8, i64 46}
+!2 = !{!"S5", !3, i64 0, !3, i64 31, !4, i64 62, !9, i64 64}
+!3 = !{!"S2", !4, i64 0, !5, i64 2, !7, i64 3, !7, i64 7, !7, i64 11, !8, i64 15, !8, i64 23}
+!4 = !{!"short", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!"int", !5, i64 0}
+!8 = !{!"long", !5, i64 0}
+!9 = !{!"S3", !7, i64 0, !4, i64 2, !4, i64 4}
+!10 = !{!2, !7, i64 42}
+!11 = !{!2, !7, i64 34}
+!12 = !{!8, !8, i64 0}

Added: llvm/trunk/test/Transforms/LICM/preheader-safe.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/preheader-safe.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/preheader-safe.ll (added)
+++ llvm/trunk/test/Transforms/LICM/preheader-safe.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,151 @@
+; RUN: opt -S -licm < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+
+declare void @use_nothrow(i64 %a) nounwind
+declare void @use(i64 %a)
+declare void @maythrow()
+
+define void @nothrow(i64 %x, i64 %y, i1* %cond) {
+; CHECK-LABEL: nothrow
+; CHECK-LABEL: entry
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+; CHECK: call void @use_nothrow(i64 %div)
+entry:
+  br label %loop
+
+loop:                                         ; preds = %entry, %loop2
+  %div = udiv i64 %x, %y
+  br label %loop2
+
+loop2:                                        ; preds = %loop
+  call void @use_nothrow(i64 %div)
+  br label %loop
+}
+
+; The udiv is guaranteed to execute if the loop is entered.
+define void @throw_header_after(i64 %x, i64 %y, i1* %cond) {
+; CHECK-LABEL: throw_header_after
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+
+loop:                                         ; preds = %entry, %loop
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div)                    ; @use is not nounwind, but it comes after the udiv
+  br label %loop
+}
+define void @throw_header_after_rec(i64* %xp, i64* %yp, i1* %cond) {
+; CHECK-LABEL: throw_header_after_rec
+; CHECK: %x = load i64, i64* %xp
+; CHECK: %y = load i64, i64* %yp
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+
+loop:                                         ; preds = %entry, %loop
+  %x = load i64, i64* %xp
+  %y = load i64, i64* %yp
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div) readonly
+  br label %loop
+}
+
+; Similar to the above, but the hoistable instruction (%y in this case)
+; happens not to be the first instruction in the block.
+define void @throw_header_after_nonfirst(i64* %xp, i64* %yp, i1* %cond) {
+; CHECK-LABEL: throw_header_after_nonfirst
+; CHECK: %y = load i64, i64* %yp
+; CHECK-LABEL: loop
+; CHECK: %x = load i64, i64* %gep
+; CHECK: %div = udiv i64 %x, %y
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+
+loop:                                         ; preds = %entry, %loop
+  %iv = phi i64 [0, %entry], [%div, %loop]
+  %gep = getelementptr i64, i64* %xp, i64 %iv ; depends on %iv, so %gep and %x are not loop-invariant
+  %x = load i64, i64* %gep
+  %y = load i64, i64* %yp
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div) readonly
+  br label %loop
+}
+
+; Negative test
+define void @throw_header_before(i64 %x, i64 %y, i1* %cond) {
+; CHECK-LABEL: throw_header_before
+; CHECK-LABEL: loop
+; CHECK: %div = udiv i64 %x, %y
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+
+loop:                                         ; preds = %entry, %loop
+  call void @maythrow()                       ; precedes the udiv, so the udiv is not guaranteed to execute
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div)
+  br label %loop
+}
+
+; The header is known no throw, but the loop is not.  We can
+; still lift out of the header.
+define void @nothrow_header(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: nothrow_header
+; CHECK-LABEL: entry
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+  ; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+loop:                                         ; preds = %entry, %loop-if
+  %div = udiv i64 %x, %y
+  br i1 %cond, label %loop-if, label %exit
+loop-if:                                      ; preds = %loop
+  call void @use(i64 %div)
+  br label %loop
+exit:                                         ; preds = %loop
+  ret void
+}
+
+; Positive test - can hoist something that happens before thrower.
+define void @nothrow_header_pos(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: nothrow_header_pos
+; CHECK-LABEL: entry
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+loop:                                         ; preds = %entry, %loop-if
+  br label %loop-if
+loop-if:                                      ; preds = %loop
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div)
+  br label %loop
+}
+
+
+; Negative test - can't move out of throwing block
+define void @nothrow_header_neg(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: nothrow_header_neg
+; CHECK-LABEL: entry
+; CHECK-LABEL: loop
+; CHECK: call void @maythrow()
+; CHECK: %div = udiv i64 %x, %y
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+loop:                                         ; preds = %entry, %loop-if
+  br label %loop-if
+loop-if:                                      ; preds = %loop
+  call void @maythrow()                       ; precedes the udiv in the same block
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div)
+  br label %loop
+}

Added: llvm/trunk/test/Transforms/LICM/promote-order.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/promote-order.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/promote-order.ll (added)
+++ llvm/trunk/test/Transforms/LICM/promote-order.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s
+; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+
+; LICM should keep the stores in their original order when it sinks/promotes them.
+; rdar://12045203
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@p = external global i8*
+
+define i32* @_Z4doiti(i32 %n, float* %tmp1, i32* %tmp3) nounwind {
+; CHECK-LABEL: for.body.lr.ph:
+; CHECK: store float 1.000000e+00, float* %tmp1
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK: store i32 1, i32* %tmp3
+
+entry:
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  store float 1.000000e+00, float* %tmp1, align 4, !tbaa !1  ; tagged as a "float" access (!1 -> !3)
+  store i32 1, i32* %tmp3, align 4, !tbaa !2      ; tagged as an "int" access (!2 -> !4)
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  %split = phi i32* [ %tmp3, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  %r.0.lcssa = phi i32* [ %split, %for.cond.for.end_crit_edge ], [ undef, %entry ]
+  ret i32* %r.0.lcssa
+}
+
+!0 = !{!"minimal TBAA"}
+!1 = !{!3, !3, i64 0}
+!2 = !{!4, !4, i64 0}
+!3 = !{!"float", !0}
+!4 = !{!"int", !0}

Added: llvm/trunk/test/Transforms/LICM/promote-tls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/promote-tls.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/promote-tls.ll (added)
+++ llvm/trunk/test/Transforms/LICM/promote-tls.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,178 @@
+; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s
+; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+
+; If we can prove a local is thread local, we can insert stores during
+; promotion which wouldn't be legal otherwise.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-linux-generic"
+
+@p = external global i8*
+
+declare i8* @malloc(i64)
+
+; Exercise the TLS case
+; CHECK-LABEL: @test
+define i32* @test(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @malloc(i64 16)
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+; CHECK-LABEL: for.body.lr.ph:
+; CHECK-NEXT: %addr.promoted = load i32, i32* %addr, align 4
+  br label %for.header
+
+for.header:                                       ; preds = %for.body, %for.body.lr.ph
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %old = load i32, i32* %addr, align 4
+  ; deliberately impossible-to-analyze branch
+  %guard = load atomic i8*, i8** @p monotonic, align 8
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:                                       ; preds = %for.header
+; CHECK-LABEL: early-exit:
+; CHECK: store i32 %new1.lcssa, i32* %addr, align 4
+  ret i32* null
+
+for.body:                                         ; preds = %for.header
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK: store i32 %new.lcssa, i32* %addr, align 4
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
+
+; Stack allocations can also be thread-local
+; CHECK-LABEL: @test2
+define i32* @test2(i32 %n) {
+entry:
+  %mem = alloca i8, i32 16                        ; stack allocation instead of the malloc call used in @test
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+; CHECK-LABEL: for.body.lr.ph:
+; CHECK-NEXT: %addr.promoted = load i32, i32* %addr, align 4
+  br label %for.header
+
+for.header:                                       ; preds = %for.body, %for.body.lr.ph
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %old = load i32, i32* %addr, align 4
+  ; deliberately impossible-to-analyze branch
+  %guard = load atomic i8*, i8** @p monotonic, align 8
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:                                       ; preds = %for.header
+; CHECK-LABEL: early-exit:
+; CHECK: store i32 %new1.lcssa, i32* %addr, align 4
+  ret i32* null
+
+for.body:                                         ; preds = %for.header
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK: store i32 %new.lcssa, i32* %addr, align 4
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
+
+declare i8* @not_malloc(i64)
+
+; Negative test - not TLS
+; CHECK-LABEL: @test_neg
+define i32* @test_neg(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @not_malloc(i64 16)  ; @not_malloc is an opaque external declaration
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.header
+
+for.header:                                       ; preds = %for.body, %for.body.lr.ph
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %old = load i32, i32* %addr, align 4
+  ; deliberately impossible-to-analyze branch
+  %guard = load volatile i8*, i8** @p
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:                                       ; preds = %for.header
+; CHECK-LABEL: early-exit:
+; CHECK-NOT: store
+  ret i32* null
+
+for.body:                                         ; preds = %for.header
+; CHECK-LABEL: for.body:
+; CHECK: store i32 %new, i32* %addr, align 4
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK-NOT: store
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
+
+; Negative test - can't speculate load since branch
+; may control alignment
+; CHECK-LABEL: @test_neg2
+define i32* @test_neg2(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @malloc(i64 16)
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.header
+
+for.header:                                       ; preds = %for.body, %for.body.lr.ph
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  ; deliberately impossible-to-analyze branch
+  %guard = load volatile i8*, i8** @p
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:                                       ; preds = %for.header
+; CHECK-LABEL: early-exit:
+; CHECK-NOT: store
+  ret i32* null
+
+for.body:                                         ; preds = %for.header
+; CHECK-LABEL: for.body:
+; CHECK: store i32 %new, i32* %addr, align 4
+  %old = load i32, i32* %addr, align 4            ; sits after the exit branch in %for.header, so it does not run on every path
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK-NOT: store
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}

Added: llvm/trunk/test/Transforms/LICM/read-only-calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/read-only-calls.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/read-only-calls.ll (added)
+++ llvm/trunk/test/Transforms/LICM/read-only-calls.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,85 @@
+; RUN: opt -S -basicaa -licm -licm-n2-threshold=0 %s | FileCheck %s
+; RUN: opt -licm -basicaa -licm-n2-threshold=200 < %s -S | FileCheck %s --check-prefix=ALIAS-N2
+; RUN: opt -aa-pipeline=basic-aa -licm-n2-threshold=0 -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -licm-n2-threshold=200 -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck %s --check-prefix=ALIAS-N2
+
+; We should be able to hoist loads in presence of read only calls and stores
+; that do not alias.
+
+; Since LICM uses the AST mechanism for alias analysis, we will clump
+; together all loads and stores in one set along with the read-only call.
+; This prevents hoisting load that doesn't alias with any other memory
+; operations.
+
+declare void @foo(i64, i32*) readonly
+
+; hoist the load out with the n2-threshold
+; since it doesn't alias with the store.
+; default AST mechanism clumps all memory locations in one set because of the
+; readonly call
+define void @test1(i32* %ptr) {
+; CHECK-LABEL: @test1(
+; CHECK-LABEL: entry:
+; CHECK-LABEL: loop:
+; CHECK: %val = load i32, i32* %ptr
+
+; ALIAS-N2-LABEL: @test1(
+; ALIAS-N2-LABEL: entry:
+; ALIAS-N2:         %val = load i32, i32* %ptr
+; ALIAS-N2-LABEL: loop:
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  %val = load i32, i32* %ptr
+  call void @foo(i64 4, i32* %ptr)
+  %p2 = getelementptr i32, i32* %ptr, i32 1
+  store volatile i32 0, i32* %p2
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; can hoist out load with the default AST and the alias analysis mechanism.
+define void @test2(i32* %ptr) {
+; CHECK-LABEL: @test2(
+; CHECK-LABEL: entry:
+; CHECK: %val = load i32, i32* %ptr
+; CHECK-LABEL: loop:
+
+; ALIAS-N2-LABEL: @test2(
+; ALIAS-N2-LABEL: entry:
+; ALIAS-N2:         %val = load i32, i32* %ptr
+; ALIAS-N2-LABEL: loop:
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  %val = load i32, i32* %ptr
+  call void @foo(i64 4, i32* %ptr)
+  %x.inc = add i32 %x, %val
+  br label %loop
+}
+
+; cannot hoist load since not guaranteed to execute
+define void @test3(i32* %ptr) {
+; CHECK-LABEL: @test3(
+; CHECK-LABEL: entry:
+; CHECK-LABEL: loop:
+; CHECK: %val = load i32, i32* %ptr
+
+; ALIAS-N2-LABEL: @test3(
+; ALIAS-N2-LABEL: entry:
+; ALIAS-N2-LABEL: loop:
+; ALIAS-N2:         %val = load i32, i32* %ptr
+entry:
+  br label %loop
+
+loop:
+  %x = phi i32 [ 0, %entry ], [ %x.inc, %loop ]
+  call void @foo(i64 4, i32* %ptr)
+  %val = load i32, i32* %ptr
+  %x.inc = add i32 %x, %val
+  br label %loop
+}

Added: llvm/trunk/test/Transforms/LICM/scalar-promote-memmodel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/scalar-promote-memmodel.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/scalar-promote-memmodel.ll (added)
+++ llvm/trunk/test/Transforms/LICM/scalar-promote-memmodel.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+
+; Make sure we don't hoist a conditionally-executed store out of the loop;
+; it would violate the concurrency memory model
+
+@g = common global i32 0, align 4
+
+define void @bar(i32 %n, i32 %b) nounwind uwtable ssp {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tobool = icmp eq i32 %b, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %tmp3 = load i32, i32* @g, align 4
+  %inc = add nsw i32 %tmp3, 1
+  store i32 %inc, i32* @g, align 4
+  br label %for.inc
+
+; CHECK: load i32, i32*
+; CHECK-NEXT: add
+; CHECK-NEXT: store i32
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/scalar-promote-unwind.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/scalar-promote-unwind.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/scalar-promote-unwind.ll (added)
+++ llvm/trunk/test/Transforms/LICM/scalar-promote-unwind.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,318 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure we don't hoist the store out of the loop; %a would
+; have the wrong value if f() unwinds
+
+define void @test1(i32* nocapture noalias %a, i1 zeroext %y) uwtable {
+entry:
+  br label %for.body
+
+for.body:
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %0 = load i32, i32* %a, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a, align 4
+  br i1 %y, label %if.then, label %for.inc
+
+; CHECK: define void @test1
+; CHECK: load i32, i32*
+; CHECK-NEXT: add
+; CHECK-NEXT: store i32
+
+if.then:
+  tail call void @f()
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; We can hoist the store out of the loop here; if f() unwinds,
+; the lifetime of %a ends.
+
+define void @test2(i1 zeroext %y) uwtable {
+entry:
+  %a = alloca i32
+  br label %for.body
+
+for.body:
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %0 = load i32, i32* %a, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a, align 4
+  br i1 %y, label %if.then, label %for.inc
+
+if.then:
+  tail call void @f()
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+; CHECK: define void @test2
+; CHECK: store i32
+; CHECK-NEXT: ret void
+  ret void
+}
+
+;; We can promote if the load can be proven safe to speculate, and the
+;; store safe to sink, even if the store *isn't* must execute.
+define void @test3(i1 zeroext %y) uwtable {
+; CHECK-LABEL: @test3(
+entry:
+; CHECK-LABEL: entry:
+; CHECK-NEXT:  %a = alloca i32
+; CHECK-NEXT:  %a.promoted = load i32, i32* %a, align 4
+  %a = alloca i32
+  br label %for.body
+
+for.body:
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %a, align 4
+  %add = add nsw i32 %0, 1
+  tail call void @f()                   ; @f is not nounwind, so the store below is not guaranteed to execute
+  store i32 %add, i32* %a, align 4      ; the checks expect this store sunk into for.cond.cleanup
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+; CHECK-LABEL: for.cond.cleanup:
+; CHECK: store i32 %add.lcssa, i32* %a, align 4
+; CHECK-NEXT: ret void
+  ret void
+}
+
+;; Same as test3, but with unordered atomics
+define void @test3b(i1 zeroext %y) uwtable {
+; CHECK-LABEL: @test3b(
+entry:
+; CHECK-LABEL: entry:
+; CHECK-NEXT:  %a = alloca i32
+; CHECK-NEXT:  %a.promoted = load atomic i32, i32* %a unordered, align 4
+  %a = alloca i32
+  br label %for.body
+
+for.body:
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load atomic i32, i32* %a unordered, align 4
+  %add = add nsw i32 %0, 1
+  tail call void @f()                   ; @f is not nounwind, so the store below is not guaranteed to execute
+  store atomic i32 %add, i32* %a unordered, align 4     ; the checks expect this store sunk (still unordered atomic)
+  %inc = add nuw nsw i32 %i.03, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+; CHECK-LABEL: for.cond.cleanup:
+; CHECK: store atomic i32 %add.lcssa, i32* %a unordered, align 4
+; CHECK-NEXT: ret void
+  ret void
+}
+
+@_ZTIi = external constant i8*
+
+; In this test, the loop is within a try block. There is an explicit unwind edge out of the loop.
+; Make sure this edge is treated as a loop exit, and that the loads and stores are promoted as
+; expected
+define void @loop_within_tryblock() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %a = alloca i32, align 4
+  store i32 0, i32* %a, align 4
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK: for.body:
+; CHECK-NOT: load
+; CHECK-NOT: store 
+; CHECK: invoke
+for.body:
+  %0 = load i32, i32* %a, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a, align 4
+  invoke void @boo()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+; CHECK: lpad:
+; CHECK: store
+; CHECK: br
+lpad:
+  %1 = landingpad { i8*, i32 }
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = extractvalue { i8*, i32 } %1, 1
+  br label %catch.dispatch
+
+catch.dispatch:
+  %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) #3
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:
+  %5 = call i8* @__cxa_begin_catch(i8* %2) #3
+  %6 = bitcast i8* %5 to i32*
+  %7 = load i32, i32* %6, align 4
+  call void @__cxa_end_catch() #3
+  br label %try.cont
+
+try.cont:
+  ret void
+
+for.end:
+  br label %try.cont
+
+eh.resume:
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %2, 0
+  %lpad.val3 = insertvalue { i8*, i32 } %lpad.val, i32 %3, 1
+  resume { i8*, i32 } %lpad.val3
+}
+
+
+; The malloc'ed memory is not captured and is therefore promoted.
+define void @malloc_no_capture() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %call = call i8* @malloc(i64 4)
+  %0 = bitcast i8* %call to i32*
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: br 
+for.body:
+  %i.0 = phi i32 [ 0, %entry  ], [ %inc, %for.latch ]
+  %1 = load i32, i32* %0, align 4
+  %add = add nsw i32 %1, 1
+  store i32 %add, i32* %0, align 4
+  br label %for.call
+
+for.call:
+  invoke void @boo()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  br label %for.latch
+
+for.latch:
+  %inc = add i32 %i.0, 1
+  %cmp = icmp slt i32 %i.0, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  br label %fun.ret
+
+lpad:
+  %2 = landingpad { i8*, i32 }
+          catch i8* null
+  %3 = extractvalue { i8*, i32 } %2, 0
+  %4 = extractvalue { i8*, i32 } %2, 1
+  br label %catch
+
+catch:
+  %5 = call i8* @__cxa_begin_catch(i8* %3) #4
+  %6 = bitcast i32* %0 to i8*
+  call void @free(i8* %6)
+  call void @__cxa_end_catch()
+  br label %fun.ret
+
+fun.ret:
+  ret void
+}
+
+; The malloc'ed memory can be captured and therefore not promoted.
+define void @malloc_capture(i32** noalias %A) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %call = call i8* @malloc(i64 4)
+  %0 = bitcast i8* %call to i32*
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: load
+; CHECK: store
+; CHECK: br 
+for.body:
+  %i.0 = phi i32 [ 0, %entry  ], [ %inc, %for.latch ]
+  %1 = load i32, i32* %0, align 4
+  %add = add nsw i32 %1, 1
+  store i32 %add, i32* %0, align 4
+  br label %for.call
+
+for.call:
+  invoke void @boo_readnone()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  br label %for.latch
+
+for.latch:
+  store i32* %0, i32** %A 
+  %inc = add i32 %i.0, 1
+  %cmp = icmp slt i32 %i.0, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  br label %fun.ret
+
+lpad:
+  %2 = landingpad { i8*, i32 }
+          catch i8* null
+  %3 = extractvalue { i8*, i32 } %2, 0
+  %4 = extractvalue { i8*, i32 } %2, 1
+  br label %catch
+
+catch:
+  %5 = call i8* @__cxa_begin_catch(i8* %3) #4
+  %6 = bitcast i32* %0 to i8*
+  call void @free(i8* %6)
+  call void @__cxa_end_catch()
+  br label %fun.ret
+
+fun.ret:
+  ret void
+}
+
+; Function Attrs: nounwind
+declare noalias i8* @malloc(i64)
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture)
+
+declare void @boo() 
+
+; This is an artificial example, readnone functions by definition cannot unwind
+; exceptions by calling the C++ exception throwing methods
+; This function should only be used to test malloc_capture.
+declare void @boo_readnone() readnone
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+declare i32 @llvm.eh.typeid.for(i8*)
+
+declare void @f() uwtable

Added: llvm/trunk/test/Transforms/LICM/scalar-promote.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/scalar-promote.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/scalar-promote.ll (added)
+++ llvm/trunk/test/Transforms/LICM/scalar-promote.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,468 @@
+; RUN: opt < %s -basicaa -tbaa -licm -S | FileCheck %s
+; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+@X = global i32 7   ; <i32*> [#uses=4]
+
+define void @test1(i32 %i) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test1(
+; CHECK: Entry:
+; CHECK-NEXT:   load i32, i32* @X
+; CHECK-NEXT:   br label %Loop
+
+
+Loop:   ; preds = %Loop, %Entry
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]    ; <i32> [#uses=1]
+  %x = load i32, i32* @X   ; <i32> [#uses=1]
+  %x2 = add i32 %x, 1   ; <i32> [#uses=1]
+  store i32 %x2, i32* @X   ; the checks expect this store to be sunk into %Out
+  %Next = add i32 %j, 1   ; <i32> [#uses=2]
+  %cond = icmp eq i32 %Next, 0    ; <i1> [#uses=1]
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+; CHECK: Out:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* @X
+; CHECK-NEXT:   ret void
+
+}
+
+define void @test2(i32 %i) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test2(
+; CHECK: Entry:
+; CHECK-NEXT:    %.promoted = load i32, i32* getelementptr inbounds (i32, i32* @X, i64 1)
+; CHECK-NEXT:    br label %Loop
+
+Loop:   ; preds = %Loop, %0
+  %X1 = getelementptr i32, i32* @X, i64 1    ; <i32*> [#uses=1]
+  %A = load i32, i32* %X1    ; <i32> [#uses=1]
+  %V = add i32 %A, 1    ; <i32> [#uses=1]
+  %X2 = getelementptr i32, i32* @X, i64 1    ; <i32*> [#uses=1]
+  store i32 %V, i32* %X2
+  br i1 false, label %Loop, label %Exit
+
+Exit:   ; preds = %Loop
+  ret void
+; CHECK: Exit:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %V
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* getelementptr inbounds (i32, i32* @X, i64 1)
+; CHECK-NEXT:   ret void
+}
+
+
+
+define void @test3(i32 %i) {
+; CHECK-LABEL: @test3(
+  br label %Loop
+Loop:
+        ; Should not promote this to a register
+  %x = load volatile i32, i32* @X
+  %x2 = add i32 %x, 1
+  store i32 %x2, i32* @X
+  br i1 true, label %Out, label %Loop
+
+; CHECK: Loop:
+; CHECK-NEXT: load volatile
+
+Out:    ; preds = %Loop
+  ret void
+}
+
+define void @test3b(i32 %i) {
+; CHECK-LABEL: @test3b(
+; CHECK-LABEL: Loop:
+; CHECK: store volatile
+; CHECK-LABEL: Out:
+  br label %Loop
+Loop:
+        ; Should not promote this to a register
+  %x = load i32, i32* @X
+  %x2 = add i32 %x, 1
+  store volatile i32 %x2, i32* @X
+  br i1 true, label %Out, label %Loop
+
+Out:    ; preds = %Loop
+  ret void
+}
+
+; PR8041
+define void @test4(i8* %x, i8 %n) {
+; CHECK-LABEL: @test4(
+  %handle1 = alloca i8*
+  %handle2 = alloca i8*
+  store i8* %x, i8** %handle1
+  br label %loop
+
+loop:
+  %tmp = getelementptr i8, i8* %x, i64 8
+  store i8* %tmp, i8** %handle2
+  br label %subloop
+
+subloop:
+  %count = phi i8 [ 0, %loop ], [ %nextcount, %subloop ]
+  %offsetx2 = load i8*, i8** %handle2
+  store i8 %n, i8* %offsetx2
+  %newoffsetx2 = getelementptr i8, i8* %offsetx2, i64 -1
+  store i8* %newoffsetx2, i8** %handle2
+  %nextcount = add i8 %count, 1
+  %innerexitcond = icmp sge i8 %nextcount, 8
+  br i1 %innerexitcond, label %innerexit, label %subloop
+
+; Should have promoted 'handle2' accesses.
+; CHECK: subloop:
+; CHECK-NEXT: phi i8* [
+; CHECK-NEXT: %count = phi i8 [
+; CHECK-NEXT: store i8 %n
+; CHECK-NOT: store
+; CHECK: br i1
+
+innerexit:
+  %offsetx1 = load i8*, i8** %handle1
+  %val = load i8, i8* %offsetx1
+  %cond = icmp eq i8 %val, %n
+  br i1 %cond, label %exit, label %loop
+
+; Should not have promoted offsetx1 loads.
+; CHECK: innerexit:
+; CHECK: %val = load i8, i8* %offsetx1
+; CHECK: %cond = icmp eq i8 %val, %n
+; CHECK: br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test5(i32 %i, i32** noalias %P2) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test5(
+; CHECK: Entry:
+; CHECK-NEXT:   load i32, i32* @X
+; CHECK-NEXT:   br label %Loop
+
+
+Loop:   ; preds = %Loop, %0
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]    ; <i32> [#uses=1]
+  %x = load i32, i32* @X   ; <i32> [#uses=1]
+  %x2 = add i32 %x, 1   ; <i32> [#uses=1]
+  store i32 %x2, i32* @X
+
+        store atomic i32* @X, i32** %P2 monotonic, align 8
+
+  %Next = add i32 %j, 1   ; <i32> [#uses=2]
+  %cond = icmp eq i32 %Next, 0    ; <i1> [#uses=1]
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+; CHECK: Out:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* @X
+; CHECK-NEXT:   ret void
+
+}
+
+
+; PR14753 - Preserve TBAA tags when promoting values in a loop.
+define void @test6(i32 %n, float* nocapture %a, i32* %gi) {
+entry:
+  store i32 0, i32* %gi, align 4, !tbaa !0
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %storemerge2 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %idxprom = sext i32 %storemerge2 to i64
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %idxprom
+  store float 0.000000e+00, float* %arrayidx, align 4, !tbaa !3
+  %0 = load i32, i32* %gi, align 4, !tbaa !0
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %gi, align 4, !tbaa !0
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT:  %gi.promoted = load i32, i32* %gi, align 4, !tbaa !0
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT:  %[[LCSSAPHI:.*]] = phi i32 [ %inc
+; CHECK-NEXT:  store i32 %[[LCSSAPHI]], i32* %gi, align 4, !tbaa !0
+}
+
+declare i32 @opaque(i32) argmemonly
+declare void @capture(i32*)
+
+; We can promote even if opaque may throw.
+define i32 @test7() {
+; CHECK-LABEL: @test7(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: load i32, i32* %local
+; CHECK-NEXT: br label %loop
+; CHECK: exit:
+; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %loop ]
+; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  call void @capture(i32* %local)
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %loop ]
+  %x = load i32, i32* %local
+  %x2 = call i32 @opaque(i32 %x) ; Note this does not capture %local
+  store i32 %x2, i32* %local
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %local
+  ret i32 %ret
+}
+
+; Make sure we don't promote if the store is really control-flow dependent.
+define i32 @test7bad() {
+; CHECK-LABEL: @test7bad(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: br label %loop
+; CHECK: if:
+; CHECK-NEXT: store i32 %x2, i32* %local
+; CHECK-NEXT: br label %else
+; CHECK: exit:
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  call void @capture(i32* %local)  
+  br label %loop
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %else ]
+  %x = load i32, i32* %local
+  %x2 = call i32 @opaque(i32 %x) ; Note this does not capture %local
+  %cmp = icmp eq i32 %x2, 0
+  br i1 %cmp, label %if, label %else
+
+if:  
+  store i32 %x2, i32* %local
+  br label %else
+
+else:
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %local
+  ret i32 %ret
+}
+
+; Even if neither the load nor the store is guaranteed to execute because
+; opaque() may throw, we can still promote - the load not being guaranteed
+; doesn't block us, because %local is always dereferenceable.
+define i32 @test8() {
+; CHECK-LABEL: @test8(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: load i32, i32* %local
+; CHECK-NEXT: br label %loop
+; CHECK: exit:
+; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %loop ]
+; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  call void @capture(i32* %local)  
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %loop ]
+  %throwaway = call i32 @opaque(i32 %j)
+  %x = load i32, i32* %local  
+  %x2 = call i32 @opaque(i32 %x)
+  store i32 %x2, i32* %local
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %local
+  ret i32 %ret
+}
+
+
+; If the store is "guaranteed modulo exceptions", and the load depends on
+; control flow, we can only promote if the pointer is otherwise known to be
+; dereferenceable
+define i32 @test9() {
+; CHECK-LABEL: @test9(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: load i32, i32* %local
+; CHECK-NEXT: br label %loop
+; CHECK: exit:
+; CHECK-NEXT: %[[LCSSAPHI:.*]] = phi i32 [ %x2, %else ]
+; CHECK-NEXT: store i32 %[[LCSSAPHI]], i32* %local
+; CHECK-NEXT: %ret = load i32, i32* %local
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  call void @capture(i32* %local)  
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %else ]  
+  %j2 = call i32 @opaque(i32 %j)
+  %cmp = icmp eq i32 %j2, 0
+  br i1 %cmp, label %if, label %else
+
+if:  
+  %x = load i32, i32* %local
+  br label %else
+
+else:
+  %x2 = phi i32 [ 0, %loop ], [ %x, %if]
+  store i32 %x2, i32* %local
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %local
+  ret i32 %ret
+}
+
+define i32 @test9bad(i32 %i) {
+; CHECK-LABEL: @test9bad(
+; CHECK: entry:
+; CHECK-NEXT: %local = alloca
+; CHECK-NEXT: call void @capture(i32* %local)
+; CHECK-NEXT: %notderef = getelementptr
+; CHECK-NEXT: br label %loop
+; CHECK: if:
+; CHECK-NEXT: load i32, i32* %notderef
+; CHECK-NEXT: br label %else
+; CHECK: exit:
+; CHECK-NEXT: %ret = load i32, i32* %notderef
+; CHECK-NEXT: ret i32 %ret
+entry:
+  %local = alloca i32
+  call void @capture(i32* %local)  
+  %notderef = getelementptr i32, i32* %local, i32 %i
+  br label %loop
+
+loop:
+  %j = phi i32 [ 0, %entry ], [ %next, %else ]  
+  %j2 = call i32 @opaque(i32 %j)
+  %cmp = icmp eq i32 %j2, 0
+  br i1 %cmp, label %if, label %else
+
+if:  
+  %x = load i32, i32* %notderef
+  br label %else
+
+else:
+  %x2 = phi i32 [ 0, %loop ], [ %x, %if]
+  store i32 %x2, i32* %notderef
+  %next = add i32 %j, 1
+  %cond = icmp eq i32 %next, 0
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %ret = load i32, i32* %notderef
+  ret i32 %ret
+}
+
+define void @test10(i32 %i) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test10(
+; CHECK: Entry:
+; CHECK-NEXT:   load atomic i32, i32* @X unordered, align 4
+; CHECK-NEXT:   br label %Loop
+
+
+Loop:   ; preds = %Loop, %0
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]    ; <i32> [#uses=1]
+  %x = load atomic i32, i32* @X unordered, align 4
+  %x2 = add i32 %x, 1
+  store atomic i32 %x2, i32* @X unordered, align 4
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+; CHECK: Out:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store atomic i32 %[[LCSSAPHI]], i32* @X unordered, align 4
+; CHECK-NEXT:   ret void
+
+}
+
+; Early exit is known not to be taken on first iteration and thus doesn't
+; affect whether load is known to execute.
+define void @test11(i32 %i) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test11(
+; CHECK: Entry:
+; CHECK-NEXT:   load i32, i32* @X
+; CHECK-NEXT:   br label %Loop
+
+
+Loop:   ; preds = %body, %Entry
+  %j = phi i32 [ 0, %Entry ], [ %Next, %body ]    ; <i32> [#uses=1]
+  %early.test = icmp ult i32 %j, 32
+  br i1 %early.test, label %body, label %Early
+body:
+  %x = load i32, i32* @X   ; <i32> [#uses=1]
+  %x2 = add i32 %x, 1   ; <i32> [#uses=1]
+  store i32 %x2, i32* @X   ; the checks expect a store in both exit blocks (%Early and %Out) instead
+  %Next = add i32 %j, 1   ; <i32> [#uses=2]
+  %cond = icmp eq i32 %Next, 0    ; <i1> [#uses=1]
+  br i1 %cond, label %Out, label %Loop
+
+Early:
+; CHECK: Early:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* @X
+; CHECK-NEXT:   ret void
+  ret void
+Out:
+  ret void
+; CHECK: Out:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store i32 %[[LCSSAPHI]], i32* @X
+; CHECK-NEXT:   ret void
+
+}
+
+!0 = !{!4, !4, i64 0}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!5, !5, i64 0}
+!4 = !{!"int", !1}
+!5 = !{!"float", !1}

Added: llvm/trunk/test/Transforms/LICM/sink-foldable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/sink-foldable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/sink-foldable.ll (added)
+++ llvm/trunk/test/Transforms/LICM/sink-foldable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,150 @@
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt < %s  -licm -S   | FileCheck %s
+
+target triple = "aarch64--linux-gnueabi"
+
+; CHECK-LABEL:@test1
+; CHECK-LABEL:loopexit1:
+; CHECK: %[[PHI:.+]] = phi i8** [ %arrayidx0, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]], i64 1
+define i8** @test1(i32 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+  %cmp0 = icmp slt i32 0, %j
+  br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end  ]
+  %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+  %i0.ext = sext i32 %i0 to i64
+  %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+  %l0 = load i8*, i8** %arrayidx0, align 8
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end:                                           ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds i8*, i8** %arrayidx0, i64 1
+  %l1 = load i8*, i8** %arrayidx1, align 8
+  %cmp4 = icmp ugt i8* %l1, %Q
+  %i.add = add nsw i32 %i0, 2
+  br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+  %p1 = phi i8** [%arrayidx0, %for.body]
+  br label %return
+
+loopexit1:
+  %p2 = phi i8** [%arrayidx1, %if.end]
+  br label  %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+  ret i8** %retval.0
+}
+
+; CHECK-LABEL: @test2
+; CHECK-LABEL: loopexit2:
+; CHECK: %[[PHI:.*]] = phi i8** [ %add.ptr, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]]
+define i8** @test2(i32 %j, i8** readonly %P, i8* readnone %Q) {
+
+entry:
+  br label %for.body
+
+for.cond:
+  %i.addr.0 = phi i32 [ %add, %if.end ]
+  %P.addr.0 = phi i8** [ %add.ptr, %if.end ]
+  %cmp = icmp slt i32 %i.addr.0, %j
+  br i1 %cmp, label %for.body, label %loopexit0
+
+for.body:
+  %P.addr = phi i8** [ %P, %entry ], [ %P.addr.0, %for.cond ]
+  %i.addr = phi i32 [ 0, %entry ], [ %i.addr.0, %for.cond ]
+
+  %idx.ext = sext i32 %i.addr to i64
+  %add.ptr = getelementptr inbounds i8*, i8** %P.addr, i64 %idx.ext
+  %l0 = load i8*, i8** %add.ptr, align 8
+
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit1, label %if.end
+
+if.end:
+  %add.i = add i32 %i.addr, 1
+  %idx2.ext = sext i32 %add.i to i64
+  %arrayidx2 = getelementptr inbounds i8*, i8** %add.ptr, i64 %idx2.ext
+  %l1 = load i8*, i8** %arrayidx2, align 8
+  %cmp2 = icmp ugt i8* %l1, %Q
+  %add = add nsw i32 %add.i, 1
+  br i1 %cmp2, label %loopexit2, label %for.cond
+
+loopexit0:
+  %p0 = phi i8** [ null, %for.cond ]
+  br label %return
+
+loopexit1:
+  %p1 = phi i8** [ %add.ptr, %for.body ]
+  br label %return
+
+loopexit2:
+  %p2 = phi i8** [ %arrayidx2, %if.end ]
+  br label %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit1 ], [ %p2, %loopexit2 ], [ %p0, %loopexit0 ]
+  ret i8** %retval.0
+}
+
+
+; CHECK-LABEL: @test3
+; CHECK-LABEL: loopexit1:
+; CHECK: %[[ADD:.*]]  = phi i64 [ %add, %if.end ]
+; CHECK: %[[ADDR:.*]] = phi i8** [ %P.addr, %if.end ]
+; CHECK: %[[TRUNC:.*]] = trunc i64 %[[ADD]] to i32
+; CHECK: getelementptr inbounds i8*, i8** %[[ADDR]], i32 %[[TRUNC]]
+; CHECK: call void @dummy(i32 %[[TRUNC]])
+define i8** @test3(i64 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+  %cmp0 = icmp slt i64 0, %j
+  br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end  ]
+  %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+  %i0.ext = sext i32 %i0 to i64
+  %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+  %l0 = load i8*, i8** %arrayidx0, align 8
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end:                                           ; preds = %for.body
+  %add = add i64 %i0.ext, 1
+  %trunc = trunc i64 %add to i32
+  %arrayidx1 = getelementptr inbounds i8*, i8** %P.addr, i32 %trunc
+  %l1 = load i8*, i8** %arrayidx1, align 8
+  %cmp4 = icmp ugt i8* %l1, %Q
+  %i.add = add nsw i32 %i0, 2
+  br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+  %p1 = phi i8** [%arrayidx0, %for.body]
+  br label %return
+
+loopexit1:
+  %p2 = phi i8** [%arrayidx1, %if.end]
+  call void @dummy(i32 %trunc)
+  br label  %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+  ret i8** %retval.0
+}
+
+declare void @dummy(i32)

Added: llvm/trunk/test/Transforms/LICM/sink-promote.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/sink-promote.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/sink-promote.ll (added)
+++ llvm/trunk/test/Transforms/LICM/sink-promote.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,50 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+
+; Test moved from sinking.ll, as it tests sinking of a store that alone touches
+; a memory location in a loop.
+; Store can be sunk out of exit block containing indirectbr instructions after
+; D50925. Updated to use an argument instead of undef, due to PR38989.
+; The store to %ptr is written in lab21, which branches back to itself. The
+; checks expect a store directly followed by "br label %lab4", and no store
+; left in lab21 or lab22.
+define void @test12(i32* %ptr) {
+; CHECK-LABEL: @test12
+; CHECK: store
+; CHECK-NEXT: br label %lab4
+  br label %lab4
+
+lab4:
+  br label %lab20
+
+lab5:
+  br label %lab20
+
+lab6:
+  br label %lab4
+
+lab7:
+  br i1 undef, label %lab8, label %lab13
+
+lab8:
+  br i1 undef, label %lab13, label %lab10
+
+lab10:
+  br label %lab7
+
+lab13:
+  ret void
+
+lab20:
+  br label %lab21
+
+lab21:
+; CHECK: lab21:
+; CHECK-NOT: store
+; CHECK: br i1 false, label %lab21, label %lab22
+  store i32 36127957, i32* %ptr, align 4
+  br i1 undef, label %lab21, label %lab22
+
+lab22:
+; CHECK: lab22:
+; CHECK-NOT: store
+; CHECK-NEXT: indirectbr i8* undef
+  indirectbr i8* undef, [label %lab5, label %lab6, label %lab7]
+}
+

Added: llvm/trunk/test/Transforms/LICM/sink.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/sink.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/sink.ll (added)
+++ llvm/trunk/test/Transforms/LICM/sink.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,64 @@
+; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-LICM
+; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK
+; RUN: opt -S < %s -passes='require<opt-remark-emit>,loop(licm),loop-sink' \
+; RUN:     | FileCheck %s --check-prefix=CHECK-SINK
+; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM
+
+; Original source code:
+; int g;
+; int foo(int p, int x) {
+;   for (int i = 0; i != x; i++)
+;     if (__builtin_expect(i == p, 0)) {
+;       x += g; x *= g;
+;     }
+;   return x;
+; }
+;
+; LICM alone hoists the load of global value g to the preheader; with loop-sink
+; added, the load should end up back in the cold .then block.
+
+; Global loaded in the .then block of @foo.
+@g = global i32 0, align 4
+
+; The two unnamed arguments are %0 (p in the C source above) and %1 (x), so the
+; unnamed entry block is %2; that is the block the phi in ._crit_edge refers to.
+; The LICM-only runs expect the load of @g in .lr.ph.preheader. The runs that
+; add loop-sink expect it in .then, the successor given weight 1 (against 2000)
+; by !1.
+define i32 @foo(i32, i32) #0 !prof !2 {
+  %3 = icmp eq i32 %1, 0
+  br i1 %3, label %._crit_edge, label %.lr.ph.preheader
+
+.lr.ph.preheader:
+  br label %.lr.ph
+
+; CHECK-LICM: .lr.ph.preheader:
+; CHECK-LICM: load i32, i32* @g
+; CHECK-LICM: br label %.lr.ph
+
+.lr.ph:
+  %.03 = phi i32 [ %8, %.combine ], [ 0, %.lr.ph.preheader ]
+  %.012 = phi i32 [ %.1, %.combine ], [ %1, %.lr.ph.preheader ]
+  %4 = icmp eq i32 %.03, %0
+  br i1 %4, label %.then, label %.combine, !prof !1
+
+.then:
+  %5 = load i32, i32* @g, align 4
+  %6 = add nsw i32 %5, %.012
+  %7 = mul nsw i32 %6, %5
+  br label %.combine
+
+; CHECK-SINK: .then:
+; CHECK-SINK: load i32, i32* @g
+; CHECK-SINK: br label %.combine
+
+.combine:
+  %.1 = phi i32 [ %7, %.then ], [ %.012, %.lr.ph ]
+  %8 = add nuw nsw i32 %.03, 1
+  %9 = icmp eq i32 %8, %.1
+  br i1 %9, label %._crit_edge.loopexit, label %.lr.ph
+
+._crit_edge.loopexit:
+  %.1.lcssa = phi i32 [ %.1, %.combine ]
+  br label %._crit_edge
+
+._crit_edge:
+  %.01.lcssa = phi i32 [ 0, %2 ], [ %.1.lcssa, %._crit_edge.loopexit ]
+  ret i32 %.01.lcssa
+}
+
+!1 = !{!"branch_weights", i32 1, i32 2000}
+!2 = !{!"function_entry_count", i64 1}

Added: llvm/trunk/test/Transforms/LICM/sinking.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/sinking.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/sinking.ll (added)
+++ llvm/trunk/test/Transforms/LICM/sinking.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,743 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+; RUN: opt < %s -debugify -basicaa -licm -S | FileCheck %s -check-prefix=DEBUGIFY
+; RUN: opt < %s -basicaa -licm -S -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
+
+
+declare i32 @strlen(i8*) readonly nounwind
+
+declare void @foo()
+
+; Sink readonly function.
+; %A is used only by the ret in Out. The checks expect the call to appear as
+; the first instruction of Out, immediately before the ret.
+define i32 @test1(i8* %P) {
+	br label %Loop
+
+Loop:		; preds = %Loop, %0
+	%A = call i32 @strlen( i8* %P ) readonly
+	br i1 false, label %Loop, label %Out
+
+Out:		; preds = %Loop
+	ret i32 %A
+; CHECK-LABEL: @test1(
+; CHECK: Out:
+; CHECK-NEXT: call i32 @strlen
+; CHECK-NEXT: ret i32 %A
+}
+
+declare double @sin(double) readnone nounwind
+
+; Sink readnone function out of loop with unknown memory behavior.
+; @foo is declared without memory attributes, so the loop also contains a call
+; with unknown effects. The checks expect the readnone @sin call at the top of
+; Out, immediately before the ret.
+define double @test2(double %X) {
+	br label %Loop
+
+Loop:		; preds = %Loop, %0
+	call void @foo( )
+	%A = call double @sin( double %X ) readnone
+	br i1 true, label %Loop, label %Out
+
+Out:		; preds = %Loop
+	ret double %A
+; CHECK-LABEL: @test2(
+; CHECK: Out:
+; CHECK-NEXT: call double @sin
+; CHECK-NEXT: ret double %A
+}
+
+; FIXME: Should be able to sink this case
+; Same loop shape as @test2 with an sdiv in place of the call to @sin. The
+; checks expect the sdiv at the top of Out, immediately before the ret.
+; NOTE(review): the FIXME above looks stale, since the checks already expect
+; the sdiv in Out -- confirm.
+define i32 @test2b(i32 %X) {
+	br label %Loop
+
+Loop:		; preds = %Loop, %0
+	call void @foo( )
+	%A = sdiv i32 10, %X
+	br i1 true, label %Loop, label %Out
+
+Out:		; preds = %Loop
+	ret i32 %A
+; CHECK-LABEL: @test2b(
+; CHECK: Out:
+; CHECK-NEXT: sdiv
+; CHECK-NEXT: ret i32 %A
+}
+
+; The load carries !invariant.load and follows a call to @foo in the loop. The
+; checks expect it at the top of Out, immediately before the ret.
+define double @test2c(double* %P) {
+	br label %Loop
+
+Loop:		; preds = %Loop, %0
+	call void @foo( )
+	%A = load double, double* %P, !invariant.load !{}
+	br i1 true, label %Loop, label %Out
+
+Out:		; preds = %Loop
+	ret double %A
+; CHECK-LABEL: @test2c(
+; CHECK: Out:
+; CHECK-NEXT: load double
+; CHECK-NEXT: ret double %A
+}
+
+; This testcase checks to make sure the sinker does not cause problems with
+; critical edges.
+; Exit is reached from both Entry and Loop, and Loop has two successors, so the
+; Loop->Exit edge is critical. The checks expect %X to be sunk (as %X.le) into
+; an Exit.loopexit block that then branches to Exit.
+define void @test3() {
+Entry:
+	br i1 false, label %Loop, label %Exit
+Loop:
+	%X = add i32 0, 1
+	br i1 false, label %Loop, label %Exit
+Exit:
+	%Y = phi i32 [ 0, %Entry ], [ %X, %Loop ]
+	ret void
+        
+; CHECK-LABEL: @test3(
+; CHECK:     Exit.loopexit:
+; CHECK-NEXT:  %X.le = add i32 0, 1
+; CHECK-NEXT:  br label %Exit
+
+}
+
+; If the result of an instruction is only used outside of the loop, sink
+; the instruction to the exit blocks instead of executing it on every
+; iteration of the loop.
+;
+; %tmp.6 is used only by %tmp.7, and %tmp.7 only by the ret in Out. The checks
+; expect Out to start with a phi of %N_addr.0.pn, followed by the mul, the sub
+; and the ret.
+define i32 @test4(i32 %N) {
+Entry:
+	br label %Loop
+Loop:		; preds = %Loop, %Entry
+	%N_addr.0.pn = phi i32 [ %dec, %Loop ], [ %N, %Entry ]	
+	%tmp.6 = mul i32 %N, %N_addr.0.pn		; <i32> [#uses=1]
+	%tmp.7 = sub i32 %tmp.6, %N		; <i32> [#uses=1]
+	%dec = add i32 %N_addr.0.pn, -1		; <i32> [#uses=1]
+	%tmp.1 = icmp ne i32 %N_addr.0.pn, 1		; <i1> [#uses=1]
+	br i1 %tmp.1, label %Loop, label %Out
+Out:		; preds = %Loop
+	ret i32 %tmp.7
+; CHECK-LABEL: @test4(
+; CHECK:     Out:
+; CHECK-NEXT:  %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn
+; CHECK-NEXT:  mul i32 %N, %[[LCSSAPHI]]
+; CHECK-NEXT:  sub i32 %tmp.6.le, %N
+; CHECK-NEXT:  ret i32
+}
+
+; To reduce register pressure, if a load is hoistable out of the loop, and the
+; result of the load is only used outside of the loop, sink the load instead of
+; hoisting it!
+;
+@X = global i32 5		; <i32*> [#uses=1]
+
+; %tmp.6 is a load of @X whose only use is the ret in Out. The checks expect it
+; at the top of Out as %tmp.6.le, immediately before the ret.
+define i32 @test5(i32 %N) {
+Entry:
+	br label %Loop
+Loop:		; preds = %Loop, %Entry
+	%N_addr.0.pn = phi i32 [ %dec, %Loop ], [ %N, %Entry ]	
+	%tmp.6 = load i32, i32* @X		; <i32> [#uses=1]
+	%dec = add i32 %N_addr.0.pn, -1		; <i32> [#uses=1]
+	%tmp.1 = icmp ne i32 %N_addr.0.pn, 1		; <i1> [#uses=1]
+	br i1 %tmp.1, label %Loop, label %Out
+Out:		; preds = %Loop
+	ret i32 %tmp.6
+; CHECK-LABEL: @test5(
+; CHECK:     Out:
+; CHECK-NEXT:  %tmp.6.le = load i32, i32* @X
+; CHECK-NEXT:  ret i32 %tmp.6.le
+}
+
+
+
+; The loop sinker was running from the bottom of the loop to the top, causing
+; it to miss opportunities to sink instructions that depended on sinking other
+; instructions from the loop.  Instead they got hoisted, which is better than
+; leaving them in the loop, but increases register pressure pointlessly.
+
+	%Ty = type { i32, i32 }
+; Global of type %Ty addressed by the GEPs in @test6 and @test11.
+@X2 = external global %Ty
+
+; %sunk2 loads through the GEP %dead, and only %sunk2 is used outside the loop.
+; The checks expect both in Out, GEP first, as %dead.le and %sunk2.le.
+define i32 @test6() {
+	br label %Loop
+Loop:
+	%dead = getelementptr %Ty, %Ty* @X2, i64 0, i32 0
+	%sunk2 = load i32, i32* %dead
+	br i1 false, label %Loop, label %Out
+Out:		; preds = %Loop
+	ret i32 %sunk2
+; CHECK-LABEL: @test6(
+; CHECK:     Out:
+; CHECK-NEXT:  %dead.le = getelementptr %Ty, %Ty* @X2, i64 0, i32 0
+; CHECK-NEXT:  %sunk2.le = load i32, i32* %dead.le
+; CHECK-NEXT:  ret i32 %sunk2.le
+}
+
+
+
+; This testcase ensures that we can sink instructions from loops with
+; multiple exits.
+;
+; %tmp.7 is returned from both exit blocks. The checks expect each of Out1 and
+; Out2 to start with its own phi of %N_addr.0.pn followed by a copy of the mul
+; and the sub.
+define i32 @test7(i32 %N, i1 %C) {
+Entry:
+	br label %Loop
+Loop:		; preds = %ContLoop, %Entry
+	%N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+	%tmp.6 = mul i32 %N, %N_addr.0.pn
+	%tmp.7 = sub i32 %tmp.6, %N		; <i32> [#uses=2]
+	%dec = add i32 %N_addr.0.pn, -1		; <i32> [#uses=1]
+	br i1 %C, label %ContLoop, label %Out1
+ContLoop:
+	%tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+	br i1 %tmp.1, label %Loop, label %Out2
+Out1:		; preds = %Loop
+	ret i32 %tmp.7
+Out2:		; preds = %ContLoop
+	ret i32 %tmp.7
+; CHECK-LABEL: @test7(
+; CHECK:     Out1:
+; CHECK-NEXT:  %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn
+; CHECK-NEXT:  mul i32 %N, %[[LCSSAPHI]]
+; CHECK-NEXT:  sub i32 %tmp.6.le, %N
+; CHECK-NEXT:  ret
+; CHECK:     Out2:
+; CHECK-NEXT:  %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn
+; CHECK-NEXT:  mul i32 %N, %[[LCSSAPHI]]
+; CHECK-NEXT:  sub i32 %tmp.6.le4, %N
+; CHECK-NEXT:  ret
+}
+
+
+; This testcase checks to make sure we can sink values which are only live on
+; some exits out of the loop, and that we can do so without breaking dominator
+; info.
+; %V is used only in exit2, while %X also feeds a store in Cont. The checks
+; expect exit1 to hold just the ret, and exit2 to compute %V.le from a phi
+; of %X.
+define i32 @test8(i1 %C1, i1 %C2, i32* %P, i32* %Q) {
+Entry:
+	br label %Loop
+Loop:		; preds = %Cont, %Entry
+	br i1 %C1, label %Cont, label %exit1
+Cont:		; preds = %Loop
+	%X = load i32, i32* %P		; <i32> [#uses=2]
+	store i32 %X, i32* %Q
+	%V = add i32 %X, 1		; <i32> [#uses=1]
+	br i1 %C2, label %Loop, label %exit2
+exit1:		; preds = %Loop
+	ret i32 0
+exit2:		; preds = %Cont
+	ret i32 %V
+; CHECK-LABEL: @test8(
+; CHECK:     exit1:
+; CHECK-NEXT:  ret i32 0
+; CHECK:     exit2:
+; CHECK-NEXT:  %[[LCSSAPHI:.*]] = phi i32 [ %X
+; CHECK-NEXT:  %V.le = add i32 %[[LCSSAPHI]], 1
+; CHECK-NEXT:  ret i32 %V.le
+}
+
+
+; %inc.1.i is used only by the phi in loopentry.3.i.preheader. The checks
+; expect it to be placed in loopentry.3.i.preheader.loopexit as %inc.1.i.le,
+; directly before that block's branch.
+define void @test9() {
+loopentry.2.i:
+	br i1 false, label %no_exit.1.i.preheader, label %loopentry.3.i.preheader
+no_exit.1.i.preheader:		; preds = %loopentry.2.i
+	br label %no_exit.1.i
+no_exit.1.i:		; preds = %endif.8.i, %no_exit.1.i.preheader
+	br i1 false, label %return.i, label %endif.8.i
+endif.8.i:		; preds = %no_exit.1.i
+	%inc.1.i = add i32 0, 1		; <i32> [#uses=1]
+	br i1 false, label %no_exit.1.i, label %loopentry.3.i.preheader.loopexit
+loopentry.3.i.preheader.loopexit:		; preds = %endif.8.i
+	br label %loopentry.3.i.preheader
+loopentry.3.i.preheader:		; preds = %loopentry.3.i.preheader.loopexit, %loopentry.2.i
+	%arg_num.0.i.ph13000 = phi i32 [ 0, %loopentry.2.i ], [ %inc.1.i, %loopentry.3.i.preheader.loopexit ]		; <i32> [#uses=0]
+	ret void
+return.i:		; preds = %no_exit.1.i
+	ret void
+
+; CHECK-LABEL: @test9(
+; CHECK: loopentry.3.i.preheader.loopexit:
+; CHECK-NEXT:  %inc.1.i.le = add i32 0, 1
+; CHECK-NEXT:  br label %loopentry.3.i.preheader
+}
+
+
+; Potentially trapping instructions may be sunk as long as they are guaranteed
+; to be executed.
+; The sdiv sits in the single-block loop and its result is used only by the ret
+; in Out. The checks expect it in Out as %tmp.6.le, dividing by a phi of
+; %N_addr.0.pn.
+define i32 @test10(i32 %N) {
+Entry:
+	br label %Loop
+Loop:		; preds = %Loop, %Entry
+	%N_addr.0.pn = phi i32 [ %dec, %Loop ], [ %N, %Entry ]		; <i32> [#uses=3]
+	%tmp.6 = sdiv i32 %N, %N_addr.0.pn		; <i32> [#uses=1]
+	%dec = add i32 %N_addr.0.pn, -1		; <i32> [#uses=1]
+	%tmp.1 = icmp ne i32 %N_addr.0.pn, 0		; <i1> [#uses=1]
+	br i1 %tmp.1, label %Loop, label %Out
+Out:		; preds = %Loop
+	ret i32 %tmp.6
+        
+; CHECK-LABEL: @test10(
+; CHECK: Out: 
+; CHECK-NEXT:  %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn
+; CHECK-NEXT:  %tmp.6.le = sdiv i32 %N, %[[LCSSAPHI]]
+; CHECK-NEXT:  ret i32 %tmp.6.le
+}
+
+; Should delete, not sink, dead instructions.
+; Neither GEP has a use. The checks expect Out to contain only the ret; the
+; DEBUGIFY run additionally looks for two llvm.dbg.value calls that refer
+; to @X2.
+define void @test11() {
+	br label %Loop
+Loop:
+	%dead1 = getelementptr %Ty, %Ty* @X2, i64 0, i32 0
+	%dead2 = getelementptr %Ty, %Ty* @X2, i64 0, i32 1
+	br i1 false, label %Loop, label %Out
+Out:
+	ret void
+; CHECK-LABEL: @test11(
+; CHECK:     Out:
+; CHECK-NEXT:  ret void
+
+; The GEP in dead1 is adding a zero offset, so the DIExpression can be kept as
+; a "register location".
+; The GEP in dead2 is adding 4 bytes to the pointer, so the DIExpression is
+; turned into an "implicit location" using DW_OP_stack_value.
+;
+; DEBUGIFY-LABEL: @test11(
+; DEBUGIFY: call void @llvm.dbg.value(metadata %Ty* @X2, metadata {{.*}}, metadata !DIExpression())
+; DEBUGIFY: call void @llvm.dbg.value(metadata %Ty* @X2, metadata {{.*}}, metadata !DIExpression(DW_OP_plus_uconst, 4, DW_OP_stack_value))
+}
+
+; One-element array indexed by %iv in @PR18753.
+@c = common global [1 x i32] zeroinitializer, align 4
+
+; Test a *many* way nested loop with multiple exit blocks both of which exit
+; multiple loop nests. This exercises LCSSA corner cases.
+; %l (a trunc of %iv) is computed in l4.body and used only by the phi in exit,
+; which is reached from both l4.body and l4.latch. The checks expect exit to
+; start with a phi of %iv, followed by the trunc as %l.le and a ret of %l.le.
+define i32 @PR18753(i1* %a, i1* %b, i1* %c, i1* %d) {
+entry:
+  br label %l1.header
+
+l1.header:
+  %iv = phi i64 [ %iv.next, %l1.latch ], [ 0, %entry ]
+  %arrayidx.i = getelementptr inbounds [1 x i32], [1 x i32]* @c, i64 0, i64 %iv
+  br label %l2.header
+
+l2.header:
+  %x0 = load i1, i1* %c, align 4
+  br i1 %x0, label %l1.latch, label %l3.preheader
+
+l3.preheader:
+  br label %l3.header
+
+l3.header:
+  %x1 = load i1, i1* %d, align 4
+  br i1 %x1, label %l2.latch, label %l4.preheader
+
+l4.preheader:
+  br label %l4.header
+
+l4.header:
+  %x2 = load i1, i1* %a
+  br i1 %x2, label %l3.latch, label %l4.body
+
+l4.body:
+  call void @f(i32* %arrayidx.i)
+  %x3 = load i1, i1* %b
+  %l = trunc i64 %iv to i32
+  br i1 %x3, label %l4.latch, label %exit
+
+l4.latch:
+  call void @g()
+  %x4 = load i1, i1* %b, align 4
+  br i1 %x4, label %l4.header, label %exit
+
+l3.latch:
+  br label %l3.header
+
+l2.latch:
+  br label %l2.header
+
+l1.latch:
+  %iv.next = add nsw i64 %iv, 1
+  br label %l1.header
+
+exit:
+  %lcssa = phi i32 [ %l, %l4.latch ], [ %l, %l4.body ]
+; CHECK-LABEL: @PR18753(
+; CHECK:       exit:
+; CHECK-NEXT:    %[[LCSSAPHI:.*]] = phi i64 [ %iv, %l4.latch ], [ %iv, %l4.body ]
+; CHECK-NEXT:    %l.le = trunc i64 %[[LCSSAPHI]] to i32
+; CHECK-NEXT:    ret i32 %l.le
+
+  ret i32 %lcssa
+}
+
+; @test12 moved to sink-promote.ll, as it tests sinking and promotion.
+
+; Test that we don't crash when trying to sink stores and there's no preheader
+; available (which is used for creating loads that may be used by the SSA
+; updater)
+; lab60 is entered from lab59's indirectbr and from lab20, and leaves through
+; another indirectbr. The checks expect the store to remain in lab60, directly
+; before that indirectbr.
+define void @test13() {
+; CHECK-LABEL: @test13
+  br label %lab59
+
+lab19:
+  br i1 undef, label %lab20, label %lab38
+
+lab20:
+  br label %lab60
+
+lab21:
+  br i1 undef, label %lab22, label %lab38
+
+lab22:
+  br label %lab38
+
+lab38:
+  ret void
+
+lab59:
+  indirectbr i8* undef, [label %lab60, label %lab38]
+
+lab60:
+; CHECK: lab60:
+; CHECK: store
+; CHECK-NEXT: indirectbr
+  store i32 2145244101, i32* undef, align 4
+  indirectbr i8* undef, [label %lab21, label %lab19]
+}
+
+; Check that LICM can sink a sinkable instruction to the exit blocks through
+; a non-trivially replaceable PHI node.
+;
+; %sink.mul reaches the phi in Out12 from ContLoop and %sink.sub reaches it
+; from Loop. The checks expect no mul or sub between the Loop label and the
+; first split exit block, and each value to be recomputed in its own
+; Out12.split.loop.exit* block.
+; CHECK-LABEL: @test14
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[MUL]], %Out12.split.loop.exit ], [ %[[SUB]], %Out12.split.loop.exit1 ]
+define i32 @test14(i32 %N, i32 %N2, i1 %C) {
+Entry:
+        br label %Loop
+Loop:
+        %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+        %sink.mul = mul i32 %N, %N_addr.0.pn
+        %sink.sub = sub i32 %sink.mul, %N
+        %dec = add i32 %N_addr.0.pn, -1
+        br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+        %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+        br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+  %tmp = phi i32 [%sink.mul,  %ContLoop], [%sink.sub, %Loop]
+  ret i32 %tmp
+}
+
+; In this test, splitting predecessors is not really required because the
+; operations of the sinkable instructions (sub and mul) are the same. In this
+; case, we can sink the same sinkable operations and modify the PHI to pass the
+; operands to the shared operations. As of now, we split predecessors of
+; non-trivially replaceable PHIs by default in LICM because all incoming edges
+; of a non-trivially replaceable PHI in LCSSA form are critical.
+;
+; %sink.sub2 reaches the phi in Out12 from ContLoop and %sink.sub reaches it
+; from Loop; both are built on %sink.mul. The checks expect no mul or sub
+; between the Loop label and the first split exit block, and a mul/sub pair in
+; each Out12.split.loop.exit* block.
+; CHECK-LABEL: @test15
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB2:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[SUB]], %Out12.split.loop.exit ], [ %[[SUB2]], %Out12.split.loop.exit1 ]
+define i32 @test15(i32 %N, i32 %N2, i1 %C) {
+Entry:
+        br label %Loop
+Loop:
+        %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+        %sink.mul = mul i32 %N, %N_addr.0.pn
+        %sink.sub = sub i32 %sink.mul, %N
+        %sink.sub2 = sub i32 %sink.mul, %N2
+        %dec = add i32 %N_addr.0.pn, -1
+        br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+        %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+        br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+  %tmp = phi i32 [%sink.sub2, %ContLoop], [%sink.sub, %Loop]
+  ret i32 %tmp
+}
+
+; Sink through a non-trivially replaceable PHI node that uses the same sinkable
+; instruction multiple times.
+;
+; The switch in Loop has three case edges to Out, so %sinkable appears three
+; times in the phi %idx. The checks expect no mul between the Loop label and
+; the first split exit block, and the mul in Out.split.loop.exit1.
+; CHECK-LABEL: @test16
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL: Out.split.loop.exit:
+; CHECK: %[[PHI:.*]] = phi i32 [ %l2, %ContLoop ]
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out.split.loop.exit1:
+; CHECK: %[[SINKABLE:.*]] = mul i32 %l2.lcssa, %t.le
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out:
+; CHECK: %idx = phi i32 [ %[[PHI]], %Out.split.loop.exit ], [ %[[SINKABLE]], %Out.split.loop.exit1 ]
+define i32 @test16(i1 %c, i8** %P, i32* %P2, i64 %V) {
+entry:
+  br label %loop.ph
+loop.ph:
+  br label %Loop
+Loop:
+  %iv = phi i64 [ 0, %loop.ph ], [ %next, %ContLoop ]
+  %l2 = call i32 @getv()
+  %t = trunc i64 %iv to i32
+  %sinkable = mul i32 %l2,  %t
+  switch i32 %l2, label %ContLoop [
+    i32 32, label %Out
+    i32 46, label %Out
+    i32 95, label %Out
+  ]
+ContLoop:
+  %next = add nuw i64 %iv, 1
+  %c1 = call i1 @getc()
+  br i1 %c1, label %Loop, label %Out
+Out:
+  %idx = phi i32 [ %l2, %ContLoop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ]
+  ret i32 %idx
+}
+
+; Sink a sinkable instruction through multiple non-trivially replaceable PHIs in
+; different exit blocks.
+;
+; %sink.mul feeds phis in two different exit blocks, OutA (from ContLoop1) and
+; OutB (from ContLoop2). The checks expect a mul of a phi of %N_addr.0.pn in a
+; split exit block ahead of each of OutA and OutB.
+; CHECK-LABEL: @test17
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL:OutA.split.loop.exit{{.*}}:
+; CHECK:  %[[OP1:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop1 ]
+; CHECK:  %[[SINKABLE:.*]] = mul i32 %N, %[[OP1]]
+; CHECK:  br label %OutA
+;
+; CHECK-LABEL:OutA:
+; CHECK: phi i32{{.*}}[ %[[SINKABLE]], %OutA.split.loop.exit{{.*}} ]
+;
+; CHECK-LABEL:OutB.split.loop.exit{{.*}}:
+; CHECK:  %[[OP2:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop2 ]
+; CHECK:  %[[SINKABLE2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK:  br label %OutB
+;
+; CHECK-LABEL:OutB:
+; CHECK:  phi i32 {{.*}}[ %[[SINKABLE2]], %OutB.split.loop.exit{{.*}} ]
+define i32 @test17(i32 %N, i32 %N2) {
+Entry:
+        br label %Loop
+Loop:
+        %N_addr.0.pn = phi i32 [ %dec, %ContLoop3 ], [ %N, %Entry ]
+        %sink.mul = mul i32 %N, %N_addr.0.pn
+        %c0 = call i1 @getc()
+        br i1 %c0 , label %ContLoop1, label %OutA
+ContLoop1:
+        %c1 = call i1 @getc()
+        br i1 %c1, label %ContLoop2, label %OutA
+
+ContLoop2:
+        %c2 = call i1 @getc()
+        br i1 %c2, label %ContLoop3, label %OutB
+ContLoop3:
+        %c3 = call i1 @getc()
+        %dec = add i32 %N_addr.0.pn, -1
+        br i1 %c3, label %Loop, label %OutB
+OutA:
+        %tmp1 = phi i32 [%sink.mul, %ContLoop1], [%N2, %Loop]
+        br label %Out12
+OutB:
+        %tmp2 = phi i32 [%sink.mul, %ContLoop2], [%dec, %ContLoop3]
+        br label %Out12
+Out12:
+  %tmp = phi i32 [%tmp1, %OutA], [%tmp2, %OutB]
+  ret i32 %tmp
+}
+
+
+; Sink a sinkable instruction through both trivially and non-trivially replaceable PHIs.
+;
+; %tmp1 receives %sink.sub on both incoming edges, while %tmp2 receives %dec
+; from ContLoop and %sink.sub from Loop. The checks expect the mul/sub pair in
+; both split exit blocks and both phis in Out12 to take their values from
+; those blocks.
+; CHECK-LABEL: @test18
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL:Out12.split.loop.exit:
+; CHECK:  %[[OP:.*]] = phi i32 [ %iv, %ContLoop ]
+; CHECK:  %[[DEC:.*]] = phi i32 [ %dec, %ContLoop ]
+; CHECK:  %[[SINKMUL:.*]] = mul i32 %N, %[[OP]]
+; CHECK:  %[[SINKSUB:.*]] = sub i32 %[[SINKMUL]], %N2
+; CHECK:  br label %Out12
+;
+; CHECK-LABEL:Out12.split.loop.exit1:
+; CHECK:  %[[OP2:.*]] = phi i32 [ %iv, %Loop ]
+; CHECK:  %[[SINKMUL2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK:  %[[SINKSUB2:.*]] = sub i32 %[[SINKMUL2]], %N2
+; CHECK:  br label %Out12
+;
+; CHECK-LABEL:Out12:
+; CHECK:  %tmp1 = phi i32 [ %[[SINKSUB]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK:  %tmp2 = phi i32 [ %[[DEC]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK:  %add = add i32 %tmp1, %tmp2
+define i32 @test18(i32 %N, i32 %N2) {
+Entry:
+        br label %Loop
+Loop:
+        %iv = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+        %sink.mul = mul i32 %N, %iv
+        %sink.sub = sub i32 %sink.mul, %N2
+        %c0 = call i1 @getc()
+        br i1 %c0, label %ContLoop, label %Out12
+ContLoop:
+        %dec = add i32 %iv, -1
+        %c1 = call i1 @getc()
+        br i1 %c1, label %Loop, label %Out12
+Out12:
+  %tmp1 = phi i32 [%sink.sub, %ContLoop], [%sink.sub, %Loop]
+  %tmp2 = phi i32 [%dec, %ContLoop], [%sink.sub, %Loop]
+  %add = add i32 %tmp1, %tmp2
+  ret i32 %add
+}
+
+; Do not sink an instruction through a non-trivially replaceable PHI if the
+; terminator of a predecessor is an indirectbr, to avoid an assertion failure
+; while splitting predecessors.
+; Both edges into exit come from indirectbr terminators (in L0 and L1). The
+; checks expect %sinkable and %sinkable2 to stay in L0.
+; CHECK-LABEL: @test19
+; CHECK-LABEL: L0:
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+
+define i32 @test19(i1 %cond, i1 %cond2, i8* %address, i32 %v1) nounwind {
+entry:
+  br label %L0
+L0:
+  %indirect.goto.dest = select i1 %cond, i8* blockaddress(@test19, %exit), i8* %address
+  %v2 = call i32 @getv()
+  %sinkable = mul i32 %v1, %v2
+  %sinkable2 = add i32 %v1, %v2
+  indirectbr i8* %indirect.goto.dest, [label %L1, label %exit]
+
+L1:
+  %indirect.goto.dest2 = select i1 %cond2, i8* blockaddress(@test19, %exit), i8* %address
+  indirectbr i8* %indirect.goto.dest2, [label %L0, label %exit]
+
+exit:
+  %r = phi i32 [%sinkable, %L0], [%sinkable2, %L1]
+  ret i32 %r
+}
+
+
+; Do not sink through a non-trivially replaceable PHI if splitting predecessors
+; is not allowed in SplitBlockPredecessors().
+;
+; catch.dispatch is reached only through the unwind edges of the two invokes
+; and contains a cleanuppad. The checks expect %sinkable and %sinkable2 to stay
+; in while.cond.
+; CHECK-LABEL: @test20
+; CHECK-LABEL: while.cond
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+define void @test20(i32* %s, i1 %b, i32 %v1, i32 %v2) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %while.cond
+while.cond:
+  %v = call i32 @getv()
+  %sinkable = mul i32 %v, %v2
+  %sinkable2 = add  i32 %v, %v2
+  br i1 %b, label %try.cont, label %while.body
+while.body:
+  invoke void @may_throw()
+          to label %while.body2 unwind label %catch.dispatch
+while.body2:
+  invoke void @may_throw2()
+          to label %while.cond unwind label %catch.dispatch
+catch.dispatch:
+  %.lcssa1 = phi i32 [ %sinkable, %while.body ], [ %sinkable2, %while.body2 ]
+  %cp = cleanuppad within none []
+  store i32 %.lcssa1, i32* %s
+  cleanupret from %cp unwind to caller
+try.cont:
+  ret void
+}
+
+; The sinkable call should be sunk into a block created by splitting the exit
+; block. After the split, BlockColors entries must be added for the new blocks
+; so that a valid ColorVector can be looked up for them.
+;
+; The function names a personality routine but contains no EH pads. The checks
+; expect %sinkableCall to be absent between the Loop label and the split exit
+; block, and present in Out.split.loop.exit.
+; CHECK-LABEL:@test21_pr36184
+; CHECK-LABEL: Loop
+; CHECK-NOT: %sinkableCall
+; CHECK-LABEL:Out.split.loop.exit
+; CHECK: %sinkableCall
+define i32 @test21_pr36184(i8* %P) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %loop.ph
+
+loop.ph:
+  br label %Loop
+
+Loop:
+  %sinkableCall = call i32 @strlen( i8* %P ) readonly
+  br i1 undef, label %ContLoop, label %Out
+
+ContLoop:
+  br i1 undef, label %Loop, label %Out
+
+Out:
+  %idx = phi i32 [ %sinkableCall, %Loop ], [0, %ContLoop ]
+  ret i32 %idx
+}
+
+; We do not support splitting a landingpad block if BlockColors is not empty.
+; lpadBB is a landingpad block whose phi takes %mul from while.body2. The
+; checks expect %mul to be found after the while.body2 label and no
+; lpadBB.split* name to appear after it.
+; CHECK-LABEL: @test22
+; CHECK-LABEL: while.body2
+; CHECK-LABEL: %mul
+; CHECK-NOT: lpadBB.split{{.*}}
+define void @test22(i1 %b, i32 %v1, i32 %v2) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %while.cond
+while.cond:
+  br i1 %b, label %try.cont, label %while.body
+
+while.body:
+  invoke void @may_throw()
+          to label %while.body2 unwind label %lpadBB
+
+while.body2:
+  %v = call i32 @getv()
+  %mul = mul i32 %v, %v2
+  invoke void @may_throw2()
+          to label %while.cond unwind label %lpadBB
+lpadBB:
+  %.lcssa1 = phi i32 [ 0, %while.body ], [ %mul, %while.body2 ]
+  landingpad { i8*, i32 }
+               catch i8* null
+  br label %lpadBBSucc1
+
+lpadBBSucc1:
+  ret void
+
+try.cont:
+  ret void
+}
+
+declare void @may_throw()
+declare void @may_throw2()
+declare i32 @__CxxFrameHandler3(...)
+declare i32 @getv()
+declare i1 @getc()
+declare void @f(i32*)
+declare void @g()

Added: llvm/trunk/test/Transforms/LICM/speculate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/speculate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/speculate.ll (added)
+++ llvm/trunk/test/Transforms/LICM/speculate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,188 @@
+; RUN: opt -S -licm < %s | FileCheck %s
+; RUN: opt -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S %s | FileCheck %s
+
+; UDiv is safe to speculate if the denominator is known non-zero.
+
+; CHECK-LABEL: @safe_udiv(
+; CHECK:      %div = udiv i64 %x, 2
+; CHECK-NEXT: br label %for.body
+
+; The udiv by the constant 2 executes only on the if.then path. The checks
+; expect it directly before a "br label %for.body".
+define void @safe_udiv(i64 %x, i64 %m, i64 %n, i32* %p, i64* %q) nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.02 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %i.02
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %div = udiv i64 %x, 2
+  %arrayidx1 = getelementptr inbounds i64, i64* %q, i64 %i.02
+  store i64 %div, i64* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %inc = add i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}
+
+; UDiv is unsafe to speculate if the denominator is not known non-zero.
+
+; CHECK-LABEL: @unsafe_udiv(
+; CHECK-NOT:  udiv
+; CHECK: for.body:
+
+; The divisor %m is an unconstrained argument. The checks expect no udiv
+; between the function label and the for.body label.
+define void @unsafe_udiv(i64 %x, i64 %m, i64 %n, i32* %p, i64* %q) nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.02 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %i.02
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %div = udiv i64 %x, %m
+  %arrayidx1 = getelementptr inbounds i64, i64* %q, i64 %i.02
+  store i64 %div, i64* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %inc = add i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}
+
+; SDiv is safe to speculate if the denominator is known non-zero and
+; known to have at least one zero bit.
+
+; CHECK-LABEL: @safe_sdiv(
+; CHECK:      %div = sdiv i64 %x, 2
+; CHECK-NEXT: br label %for.body
+
+; The sdiv divides by the constant 2; %and is computed in entry but never used.
+; The checks expect the sdiv directly before a "br label %for.body".
+define void @safe_sdiv(i64 %x, i64 %m, i64 %n, i32* %p, i64* %q) nounwind {
+entry:
+  %and = and i64 %m, -3
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.02 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %i.02
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %div = sdiv i64 %x, 2
+  %arrayidx1 = getelementptr inbounds i64, i64* %q, i64 %i.02
+  store i64 %div, i64* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %inc = add i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}
+
+; SDiv is unsafe to speculate if the denominator is not known to have a zero bit.
+; Here the denominator %or = %m | 1 always has its low bit set, so it is never
+; zero, but no bit of it is known to be zero, so it may be -1.
+
+; CHECK-LABEL: @unsafe_sdiv_a(
+; CHECK-NOT:  sdiv
+; CHECK: for.body:
+
+define void @unsafe_sdiv_a(i64 %x, i64 %m, i64 %n, i32* %p, i64* %q) nounwind {
+entry:
+  %or = or i64 %m, 1
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.02 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %i.02
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %div = sdiv i64 %x, %or
+  %arrayidx1 = getelementptr inbounds i64, i64* %q, i64 %i.02
+  store i64 %div, i64* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %inc = add i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}
+
+; SDiv is unsafe to speculate if the denominator is not known non-zero.
+; Here the denominator %and = %m & -3 always has bit 1 cleared, so it is never
+; -1, but it may be zero.
+
+; CHECK-LABEL: @unsafe_sdiv_b(
+; CHECK-NOT:  sdiv
+; CHECK: for.body:
+
+define void @unsafe_sdiv_b(i64 %x, i64 %m, i64 %n, i32* %p, i64* %q) nounwind {
+entry:
+  %and = and i64 %m, -3
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.02 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %i.02
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %div = sdiv i64 %x, %and
+  %arrayidx1 = getelementptr inbounds i64, i64* %q, i64 %i.02
+  store i64 %div, i64* %arrayidx1, align 8
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %inc = add i64 %i.02, 1
+  %cmp = icmp slt i64 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  ret void
+}
+
+; SDiv is unsafe to speculate inside an infinite loop.
+
+; The sdiv is guarded by a test of %b against zero and sits in a loop that has
+; no exit edge. The checks expect no sdiv in entry ahead of its branch to
+; for.body.
+define void @unsafe_sdiv_c(i64 %a, i64 %b, i64* %p) {
+entry:
+; CHECK: entry:
+; CHECK-NOT: sdiv
+; CHECK: br label %for.body
+  br label %for.body
+
+for.body:
+  %c = icmp eq i64 %b, 0
+  br i1 %c, label %backedge, label %if.then
+
+if.then:
+  %d = sdiv i64 %a, %b
+  store i64 %d, i64* %p
+  br label %backedge
+
+backedge:
+  br label %for.body
+}

Added: llvm/trunk/test/Transforms/LICM/store-hoisting.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/store-hoisting.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/store-hoisting.ll (added)
+++ llvm/trunk/test/Transforms/LICM/store-hoisting.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,462 @@
+; RUN: opt -S -basicaa -licm %s | FileCheck -check-prefixes=CHECK,AST %s
+; RUN: opt -S -basicaa -licm -enable-mssa-loop-dependency=true %s | FileCheck  -check-prefixes=CHECK,MSSA %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' < %s -S | FileCheck -check-prefixes=CHECK,AST %s
+
+define void @test(i32* %loc) {
+; CHECK-LABEL: @test
+; CHECK-LABEL: entry:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: loop:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_multiexit(i32* %loc, i1 %earlycnd) {
+; CHECK-LABEL: @test_multiexit
+; CHECK-LABEL: entry:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: loop:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+  store i32 0, i32* %loc
+  %iv.next = add i32 %iv, 1
+  br i1 %earlycnd, label %exit1, label %backedge
+  
+backedge:
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit2
+
+exit1:
+  ret void
+exit2:
+  ret void
+}
+
+define i32* @false_negative_2use(i32* %loc) {
+; CHECK-LABEL: @false_negative_2use
+; AST-LABEL: exit:
+; AST: store i32 0, i32* %loc
+; MSSA-LABEL: entry:
+; MSSA: store i32 0, i32* %loc
+; MSSA-LABEL: loop:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32* %loc
+}
+
+define void @neg_lv_value(i32* %loc) {
+; CHECK-LABEL: @neg_lv_value
+; CHECK-LABEL: exit:
+; CHECK: store i32 %iv.lcssa, i32* %loc
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 %iv, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_lv_addr(i32* %loc) {
+; CHECK-LABEL: @neg_lv_addr
+; CHECK-LABEL: loop:
+; CHECK: store i32 0, i32* %p
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  %p = getelementptr i32, i32* %loc, i32 %iv
+  store i32 0, i32* %p
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_mod(i32* %loc) {
+; CHECK-LABEL: @neg_mod
+; CHECK-LABEL: exit:
+; CHECK: store i32 %iv.lcssa, i32* %loc
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  store i32 %iv, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Hoisting the store is actually valid here, as it dominates the load.
+define void @neg_ref(i32* %loc) {
+; CHECK-LABEL: @neg_ref
+; CHECK-LABEL: exit1:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit2:
+; CHECK: store i32 0, i32* %loc
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+  store i32 0, i32* %loc
+  %v = load i32, i32* %loc
+  %earlycnd = icmp eq i32 %v, 198
+  br i1 %earlycnd, label %exit1, label %backedge
+  
+backedge:
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit2
+
+exit1:
+  ret void
+exit2:
+  ret void
+}
+
+; Hoisting the store here leads to a miscompile.
+define void @neg_ref2(i32* %loc) {
+; CHECK-LABEL: @neg_ref2
+; CHECK-LABEL: exit1:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit2:
+; CHECK: store i32 0, i32* %loc
+entry:
+  store i32 198, i32* %loc
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+  %v = load i32, i32* %loc
+  store i32 0, i32* %loc
+  %earlycnd = icmp eq i32 %v, 198
+  br i1 %earlycnd, label %exit1, label %backedge
+  
+backedge:
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit2
+
+exit1:
+  ret void
+exit2:
+  ret void
+}
+
+declare void @modref()
+
+define void @neg_modref(i32* %loc) {
+; CHECK-LABEL: @neg_modref
+; CHECK-LABEL: loop:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  call void @modref()
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_fence(i32* %loc) {
+; CHECK-LABEL: @neg_fence
+; CHECK-LABEL: loop:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  fence seq_cst
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_volatile(i32* %loc) {
+; CHECK-LABEL: @neg_volatile
+; CHECK-LABEL: loop:
+; CHECK: store volatile i32 0, i32* %loc
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store volatile i32 0, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_release(i32* %loc) {
+; CHECK-LABEL: @neg_release
+; CHECK-LABEL: loop:
+; CHECK: store atomic i32 0, i32* %loc release, align 4
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store atomic i32 0, i32* %loc release, align 4
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_seq_cst(i32* %loc) {
+; CHECK-LABEL: @neg_seq_cst
+; CHECK-LABEL: loop:
+; CHECK: store atomic i32 0, i32* %loc seq_cst, align 4
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store atomic i32 0, i32* %loc seq_cst, align 4
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+declare void @maythrow() inaccessiblememonly
+
+define void @neg_early_exit(i32* %loc) {
+; CHECK-LABEL: @neg_early_exit
+; CHECK-LABEL: body:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %body]
+  %is_null = icmp eq i32* %loc, null
+  br i1 %is_null, label %exit, label %body
+body:
+  call void @maythrow()
+  store i32 0, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @neg_early_throw(i32* %loc) {
+; CHECK-LABEL: @neg_early_throw
+; CHECK-LABEL: loop:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  call void @maythrow()
+  store i32 0, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_late_throw(i32* %loc) {
+; CHECK-LABEL: @test_late_throw
+; CHECK-LABEL: entry:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: loop:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  call void @maythrow()
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; TODO: could validly hoist the store here since we know what value
+; the load must observe.
+define i32 @test_dominated_read(i32* %loc) {
+; CHECK-LABEL: @test_dominated_read
+; CHECK-LABEL: exit:
+; CHECK: store i32 0, i32* %loc
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  %reload = load i32, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %reload
+}
+
+; TODO: could validly hoist the store since we already hoisted the load and
+; it's no longer in the loop.
+define i32 @test_dominating_read(i32* %loc) {
+; CHECK-LABEL: @test_dominating_read
+; CHECK-LABEL: exit:
+; CHECK: store i32 0, i32* %loc
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  %reload = load i32, i32* %loc
+  store i32 0, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %reload
+}
+
+declare void @readonly() readonly
+
+; TODO: can legally hoist since value read by call is known
+define void @test_dominated_readonly(i32* %loc) {
+; CHECK-LABEL: @test_dominated_readonly
+; CHECK-LABEL: loop:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  call void @readonly()
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; While technically possible to hoist the store to %loc, this runs across
+; a fundamental limitation of alias sets since both stores and the call are
+; within the same alias set and we can't distinguish them cheaply.
+define void @test_aliasset_fn(i32* %loc, i32* %loc2) {
+; CHECK-LABEL: @test_aliasset_fn
+; CHECK-LABEL: loop:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop]
+  store i32 0, i32* %loc
+  call void @readonly()
+  store i32 %iv, i32* %loc2
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+; If we can't tell if the value is read before the write, we can't hoist the
+; write over the potential read (since we don't know the value read)
+define void @neg_may_read(i32* %loc, i1 %maybe) {
+; CHECK-LABEL: @neg_may_read
+; CHECK-LABEL: loop:
+; CHECK: store i32 0, i32* %loc
+; CHECK-LABEL: exit:
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %merge]
+  ;; maybe is a placeholder for an unanalyzable condition
+  br i1 %maybe, label %taken, label %merge
+taken:
+  call void @readonly()
+  br label %merge
+merge:
+  store i32 0, i32* %loc
+  %iv.next = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv, 200
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LICM/strlen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/strlen.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/strlen.ll (added)
+++ llvm/trunk/test/Transforms/LICM/strlen.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt -S -inferattrs -basicaa -licm < %s | FileCheck %s
+
+define void @test(i64* noalias %loc, i8* noalias %a) {
+; CHECK-LABEL: @test
+; CHECK: @strlen
+; CHECK-LABEL: loop:
+  br label %loop
+
+loop:
+  %res = call i64 @strlen(i8* %a)
+  store i64 %res, i64* %loc
+  br label %loop
+}
+
+; CHECK: declare i64 @strlen(i8* nocapture) #0
+; CHECK: attributes #0 = { argmemonly nounwind readonly }
+declare i64 @strlen(i8*)
+
+

Added: llvm/trunk/test/Transforms/LICM/unrolled-deeply-nested.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/unrolled-deeply-nested.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/unrolled-deeply-nested.ll (added)
+++ llvm/trunk/test/Transforms/LICM/unrolled-deeply-nested.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,76 @@
+; Test that LICM correctly detects conflicting accesses to memory in deeply
+; nested subloops. This works in the legacy PM due to a special retained map of
+; alias information for inner loops, and in the new PM it is recomputed for each
+; loop.
+;
+; RUN: opt -S -aa-pipeline=basic-aa -passes='require<opt-remark-emit>,loop(licm)' < %s | FileCheck %s
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+
+define i32 @test(i32* %a, i64 %n.0, i64 %n.0.0, i64 %n.0.0.0, i64 %n.0.0.0.0) nounwind uwtable readonly {
+; CHECK-LABEL: define i32 @test
+entry:
+  %b = alloca i32
+  %c = alloca i32
+  %a.i8 = bitcast i32* %a to i8*
+  %b.i8 = bitcast i32* %b to i8*
+  %c.i8 = bitcast i32* %c to i8*
+  br label %l.0.header
+; CHECK: %b = alloca i32
+; CHECK: %c = alloca i32
+; CHECK: %[[AI8:.*]] = bitcast i32* %a to i8*
+; CHECK: %[[BI8:.*]] = bitcast i32* %b to i8*
+; CHECK: %[[CI8:.*]] = bitcast i32* %c to i8*
+; CHECK-NOT: load
+; CHECK: br
+
+l.0.header:
+  %iv.0 = phi i64 [ %iv.0.next, %l.0.latch ], [ 0, %entry ]
+  %iv.0.next = add i64 %iv.0, 1
+  %exitcond.0 = icmp eq i64 %iv.0.next, %n.0
+  %a.val = load i32, i32* %a
+  store i32 %a.val, i32* %b
+  %c.val = trunc i64 %iv.0 to i32
+  store i32 %c.val, i32* %c
+  br label %l.0.0.header
+; CHECK: %[[AV:.*]] = load i32, i32* %a
+; CHECK: store i32 %[[AV]], i32* %b
+; CHECK: %[[CT:.*]] = trunc i64 {{.*}} to i32
+; CHECK: store i32 %[[CT]], i32* %c
+; CHECK: br
+
+l.0.0.header:
+  %iv.0.0 = phi i64 [ %iv.0.0.next, %l.0.0.latch ], [ 0, %l.0.header ]
+  %iv.0.0.next = add i64 %iv.0.0, 1
+  %exitcond.0.0 = icmp eq i64 %iv.0.0.next, %n.0.0
+  br label %l.0.0.0.header
+; CHECK: br
+
+l.0.0.0.header:
+  %iv.0.0.0 = phi i64 [ %iv.0.0.0.next, %l.0.0.0.header ], [ 0, %l.0.0.header ]
+  %iv.0.0.0.next = add i64 %iv.0.0.0, 1
+  %exitcond.0.0.0 = icmp eq i64 %iv.0.0.0.next, %n.0.0.0
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a.i8, i8* %c.i8, i64 4, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b.i8, i8* %c.i8, i64 4, i1 false)
+  br i1 %exitcond.0.0.0, label %l.0.0.0.header, label %l.0.0.latch
+; CHECK: call void @llvm.memcpy.{{.*}}(i8* %[[AI8]], i8* %[[CI8]], i64 4
+; CHECK: call void @llvm.memcpy.{{.*}}(i8* %[[BI8]], i8* %[[CI8]], i64 4
+; CHECK: br
+
+l.0.0.latch:
+  br i1 %exitcond.0.0, label %l.0.0.header, label %l.0.latch
+; CHECK: br
+
+l.0.latch:
+  %b.val = load i32, i32* %b
+  br i1 %exitcond.0, label %exit, label %l.0.header
+; CHECK: %[[BV:.*]] = load i32, i32* %b
+; CHECK: br
+
+exit:
+  %result.lcssa = phi i32 [ %b.val, %l.0.latch ]
+  ret i32 %b.val
+; CHECK: %[[LCSSA:.*]] = phi i32 [ %[[BV]], %{{.*}} ]
+; CHECK: ret i32 %[[LCSSA]]
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)

Added: llvm/trunk/test/Transforms/LICM/update-scev.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/update-scev.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/update-scev.ll (added)
+++ llvm/trunk/test/Transforms/LICM/update-scev.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=IR-AFTER-TRANSFORM
+; RUN: opt -analyze -scalar-evolution -licm -scalar-evolution < %s | FileCheck %s --check-prefix=SCEV-EXPRS
+
+declare void @clobber()
+
+define void @f_0(i1* %loc) {
+; IR-AFTER-TRANSFORM-LABEL: @f_0(
+; IR-AFTER-TRANSFORM: loop.outer:
+; IR-AFTER-TRANSFORM-NEXT:  call void @clobber()
+; IR-AFTER-TRANSFORM-NEXT:  %cond = load i1, i1* %loc
+; IR-AFTER-TRANSFORM-NEXT:  br label %loop.inner
+
+; SCEV-EXPRS: Classifying expressions for: @f_0
+; SCEV-EXPRS: Classifying expressions for: @f_0
+; SCEV-EXPRS:  %cond = load i1, i1* %loc
+; SCEV-EXPRS-NEXT:   -->  {{.*}} LoopDispositions: { %loop.outer: Variant, %loop.inner: Invariant }
+
+entry:
+  br label %loop.outer
+
+loop.outer:
+  call void @clobber()
+  br label %loop.inner
+
+loop.inner:
+  %cond = load i1, i1* %loc
+  br i1 %cond, label %loop.inner, label %leave.inner
+
+leave.inner:
+  br label %loop.outer
+}

Added: llvm/trunk/test/Transforms/LICM/volatile-alias.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LICM/volatile-alias.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LICM/volatile-alias.ll (added)
+++ llvm/trunk/test/Transforms/LICM/volatile-alias.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,56 @@
+; RUN: opt -basicaa -sroa -loop-rotate -licm -S < %s | FileCheck %s
+; RUN: opt -basicaa -sroa -loop-rotate %s | opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,require<opt-remark-emit>,loop(licm)' -S | FileCheck %s
+; RUN: opt -basicaa -sroa -loop-rotate -licm -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s
+; The objects *p and *q are aliased to each other, but even though *q is
+; volatile, *p can be considered invariant in the loop. Check if it is moved
+; out of the loop.
+; CHECK: load i32, i32* %p
+; CHECK: for.body:
+; CHECK: load volatile i32, i32* %q
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind uwtable
+define i32 @foo(i32* dereferenceable(4) nonnull %p, i32* %q, i32 %n) #0 {
+entry:
+  %p.addr = alloca i32*, align 8
+  %q.addr = alloca i32*, align 8
+  %n.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %s = alloca i32, align 4
+  store i32* %p, i32** %p.addr, align 8
+  store i32* %q, i32** %q.addr, align 8
+  store i32 %n, i32* %n.addr, align 4
+  store i32 0, i32* %s, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %1 = load i32, i32* %n.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32*, i32** %p.addr, align 8
+  %3 = load i32, i32* %2, align 4
+  %4 = load i32*, i32** %q.addr, align 8
+  %5 = load volatile i32, i32* %4, align 4
+  %add = add nsw i32 %3, %5
+  %6 = load i32, i32* %s, align 4
+  %add1 = add nsw i32 %6, %add
+  store i32 %add1, i32* %s, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32, i32* %s, align 4
+  ret i32 %8
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -scoped-noalias -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=SCOPE -check-prefix=ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=NOSCOPE -check-prefix=ALL %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; This fails to vectorize if the !alias.scope is not used
+
+; ALL-LABEL: @vectorize_alias_scope(
+; SCOPE: load float, float addrspace(1)* %c
+; SCOPE: bitcast float addrspace(1)* %a to <2 x float> addrspace(1)*
+; SCOPE: store <2 x float> zeroinitializer
+; SCOPE: store float %ld.c, float addrspace(1)* %b,
+
+; NOSCOPE: store float
+; NOSCOPE: load float
+; NOSCOPE: store float
+; NOSCOPE: store float
+define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+entry:
+  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  store float 0.0, float addrspace(1)* %a, align 4, !noalias !0
+  %ld.c = load float, float addrspace(1)* %c, align 4, !alias.scope !0
+  store float 0.0, float addrspace(1)* %a.idx.1, align 4, !noalias !0
+  store float %ld.c, float addrspace(1)* %b, align 4, !noalias !0
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{!1}
+!1 = distinct !{!1, !2, !"some scope"}
+!2 = distinct !{!2, !"some domain"}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,210 @@
+; RUN: opt -S -load-store-vectorizer -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
+; RUN: opt -S -load-store-vectorizer -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
+; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=-unaligned-buffer-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
+; RUN: opt -S -passes='function(load-store-vectorizer)' -mattr=+unaligned-buffer-access,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
+
+target triple = "amdgcn--"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; ALL-LABEL: @load_unknown_offset_align1_i8(
+; ALL: alloca [128 x i8], align 1
+; UNALIGNED: load <2 x i8>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
+
+; ALIGNED: load i8, i8 addrspace(5)* %ptr0, align 1{{$}}
+; ALIGNED: load i8, i8 addrspace(5)* %ptr1, align 1{{$}}
+define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i8], align 1, addrspace(5)
+  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
+  %val0 = load i8, i8 addrspace(5)* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
+  %val1 = load i8, i8 addrspace(5)* %ptr1, align 1
+  %add = add i8 %val0, %val1
+  store i8 %add, i8 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: @load_unknown_offset_align1_i16(
+; ALL: alloca [128 x i16], align 1, addrspace(5){{$}}
+; UNALIGNED: load <2 x i16>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
+
+; ALIGNED: load i16, i16 addrspace(5)* %ptr0, align 1{{$}}
+; ALIGNED: load i16, i16 addrspace(5)* %ptr1, align 1{{$}}
+define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i16], align 1, addrspace(5)
+  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
+  %val0 = load i16, i16 addrspace(5)* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
+  %val1 = load i16, i16 addrspace(5)* %ptr1, align 1
+  %add = add i16 %val0, %val1
+  store i16 %add, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Although the offset is unknown here, we know it is a multiple
+; of the element size, so it should still be align 4.
+
+; ALL-LABEL: @load_unknown_offset_align1_i32(
+; ALL: alloca [128 x i32], align 1
+; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
+
+; ALIGNED: load i32, i32 addrspace(5)* %ptr0, align 1
+; ALIGNED: load i32, i32 addrspace(5)* %ptr1, align 1
+define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i32], align 1, addrspace(5)
+  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
+  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
+  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
+  %add = add i32 %val0, %val1
+  store i32 %add, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should always increase alignment of the load
+; Make sure alloca alignment isn't decreased
+; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32(
+; ALL: alloca [128 x i32], align 16
+
+; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
+; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
+define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i32], align 16, addrspace(5)
+  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
+  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
+  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
+  %add = add i32 %val0, %val1
+  store i32 %add, i32 addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: @store_unknown_offset_align1_i8(
+; ALL: alloca [128 x i8], align 1
+; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
+
+; ALIGNED: store i8 9, i8 addrspace(5)* %ptr0, align 1{{$}}
+; ALIGNED: store i8 10, i8 addrspace(5)* %ptr1, align 1{{$}}
+define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i8], align 1, addrspace(5)
+  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
+  store i8 9, i8 addrspace(5)* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
+  store i8 10, i8 addrspace(5)* %ptr1, align 1
+  ret void
+}
+
+; ALL-LABEL: @store_unknown_offset_align1_i16(
+; ALL: alloca [128 x i16], align 1
+; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
+
+; ALIGNED: store i16 9, i16 addrspace(5)* %ptr0, align 1{{$}}
+; ALIGNED: store i16 10, i16 addrspace(5)* %ptr1, align 1{{$}}
+define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i16], align 1, addrspace(5)
+  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
+  store i16 9, i16 addrspace(5)* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
+  store i16 10, i16 addrspace(5)* %ptr1, align 1
+  ret void
+}
+
+; FIXME: Although the offset is unknown here, we know it is a multiple
+; of the element size, so it still should be align 4.
+
+; ALL-LABEL: @store_unknown_offset_align1_i32(
+; ALL: alloca [128 x i32], align 1
+
+; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
+
+; ALIGNED: store i32 9, i32 addrspace(5)* %ptr0, align 1
+; ALIGNED: store i32 10, i32 addrspace(5)* %ptr1, align 1
+define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+  %alloca = alloca [128 x i32], align 1, addrspace(5)
+  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
+  store i32 9, i32 addrspace(5)* %ptr0, align 1
+  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
+  store i32 10, i32 addrspace(5)* %ptr1, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
+  %alloca = alloca [8 x i32], align 1, addrspace(5)
+  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  store i32 9, i32 addrspace(5)* %out, align 1
+  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
+  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
+  %alloca = alloca [8 x i8], align 1, addrspace(5)
+  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+  store i8 9, i8 addrspace(5)* %out, align 1
+  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
+  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
+  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
+  %alloca = alloca [8 x i32], align 1, addrspace(5)
+  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  %load0 = load i32, i32 addrspace(5)* %out, align 1
+  %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
+  %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
+  %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
+  %alloca = alloca [8 x i8], align 1, addrspace(5)
+  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+  %load0 = load i8, i8 addrspace(5)* %out, align 1
+  %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
+  %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
+  %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/complex-index.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,52 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+declare i64 @_Z12get_local_idj(i32)
+
+declare i64 @_Z12get_group_idj(i32)
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+; CHECK-LABEL: @factorizedVsNonfactorizedAccess(
+; CHECK: load <2 x float>
+; CHECK: store <2 x float>
+define amdgpu_kernel void @factorizedVsNonfactorizedAccess(float addrspace(1)* nocapture %c) {
+entry: ; two float load/scale/store sequences on %c at indices %add10 and (%add10 | 1); the checks above expect <2 x float> accesses
+  %call = tail call i64 @_Z12get_local_idj(i32 0)
+  %call1 = tail call i64 @_Z12get_group_idj(i32 0)
+  %div = lshr i64 %call, 4
+  %div2 = lshr i64 %call1, 3
+  %mul = shl i64 %div2, 7
+  %rem = shl i64 %call, 3
+  %mul3 = and i64 %rem, 120
+  %add = or i64 %mul, %mul3 ; %mul has its low 7 bits clear and %mul3 only has bits 3..6, so this or behaves as an add
+  %rem4 = shl i64 %call1, 7
+  %mul5 = and i64 %rem4, 896
+  %mul6 = shl nuw nsw i64 %div, 3
+  %add7 = add nuw i64 %mul5, %mul6
+  %mul9 = shl i64 %add7, 10
+  %add10 = add i64 %mul9, %add ; low three bits are zero: %mul9 is shifted left by 10 and %add has bits 0..2 clear
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %c, i64 %add10
+  %load1 = load float, float addrspace(1)* %arrayidx, align 4
+  %conv = fpext float %load1 to double
+  %mul11 = fmul double %conv, 0x3FEAB481D8F35506
+  %conv12 = fptrunc double %mul11 to float
+  %conv18 = fpext float %conv12 to double
+  %storeval1 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv18)
+  %cstoreval1 = fptrunc double %storeval1 to float
+  store float %cstoreval1, float addrspace(1)* %arrayidx, align 4
+
+  %add23 = or i64 %add10, 1 ; bit 0 of %add10 is zero, so this is the element right after %arrayidx
+  %arrayidx24 = getelementptr inbounds float, float addrspace(1)* %c, i64 %add23
+  %load2 = load float, float addrspace(1)* %arrayidx24, align 4
+  %conv25 = fpext float %load2 to double
+  %mul26 = fmul double %conv25, 0x3FEAB481D8F35506
+  %conv27 = fptrunc double %mul26 to float
+  %conv34 = fpext float %conv27 to double
+  %storeval2 = tail call double @llvm.fmuladd.f64(double 0x3FF4FFAFBBEC946A, double 0.000000e+00, double %conv34)
+  %cstoreval2 = fptrunc double %storeval2 to float
+  store float %cstoreval2, float addrspace(1)* %arrayidx24, align 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,151 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; CHECK-LABEL: @basic_merge_sext_index(
+; CHECK: sext i32 %id.x to i64
+; CHECK: load <2 x float>
+; CHECK: store <2 x float> zeroinitializer
+define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+entry: ; pairs of adjacent float accesses indexed by a sign-extended workitem id; the checks above expect one <2 x float> load and one <2 x float> zero store
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %sext.id.x = sext i32 %id.x to i64
+  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %sext.id.x
+  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %sext.id.x
+  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1 ; element right after %a.idx.x
+  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1 ; element right after %c.idx.x
+
+  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
+  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
+
+  store float 0.0, float addrspace(1)* %a.idx.x, align 4
+  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
+
+  %add = fadd float %ld.c, %ld.c.idx.1 ; consumes both loaded values
+  store float %add, float addrspace(1)* %b, align 4
+  ret void
+}
+
+; CHECK-LABEL: @basic_merge_zext_index(
+; CHECK: zext i32 %id.x to i64
+; CHECK: load <2 x float>
+; CHECK: store <2 x float>
+define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+entry: ; pairs of adjacent float accesses indexed by a zero-extended workitem id; the checks above expect one <2 x float> load and one <2 x float> store
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %zext.id.x = zext i32 %id.x to i64
+  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
+  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
+  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1 ; element right after %a.idx.x
+  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1 ; element right after %c.idx.x
+
+  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
+  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
+  store float 0.0, float addrspace(1)* %a.idx.x, align 4
+  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
+
+  %add = fadd float %ld.c, %ld.c.idx.1 ; consumes both loaded values
+  store float %add, float addrspace(1)* %b, align 4
+  ret void
+}
+
+; CHECK-LABEL: @merge_op_zext_index(
+; CHECK: load <2 x float>
+; CHECK: store <2 x float>
+define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
+entry: ; the two indices (%shl and %shl | 1) are zero-extended separately before the GEPs; the checks above expect <2 x float> accesses
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %shl = shl i32 %id.x, 2
+  %zext.id.x = zext i32 %shl to i64
+  %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %zext.id.x
+  %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %zext.id.x
+
+  %id.x.1 = or i32 %shl, 1 ; %shl has its low two bits clear, so this is %shl + 1
+  %id.x.1.ext = zext i32 %id.x.1 to i64
+
+  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
+  %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext
+
+  %ld.c.0 = load float, float addrspace(1)* %c.0, align 4
+  store float 0.0, float addrspace(1)* %a.0, align 4 ; stores to %a sit between the two loads from %c; all three pointer arguments are noalias
+  %ld.c.1 = load float, float addrspace(1)* %c.1, align 4
+  store float 0.0, float addrspace(1)* %a.1, align 4
+
+  %add = fadd float %ld.c.0, %ld.c.1
+  store float %add, float addrspace(1)* %b, align 4
+  ret void
+}
+
+; CHECK-LABEL: @merge_op_sext_index(
+; CHECK: load <2 x float>
+; CHECK: store <2 x float>
+define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
+entry: ; the two indices (%shl and %shl | 1) are sign-extended separately before the GEPs; the checks above expect <2 x float> accesses
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %shl = shl i32 %id.x, 2
+  %sext.id.x = sext i32 %shl to i64
+  %a.0 = getelementptr inbounds float, float addrspace(1)* %a, i64 %sext.id.x
+  %c.0 = getelementptr inbounds float, float addrspace(1)* %c, i64 %sext.id.x
+
+  %id.x.1 = or i32 %shl, 1 ; %shl has its low two bits clear, so this is %shl + 1
+  %id.x.1.ext = sext i32 %id.x.1 to i64
+
+  %a.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 %id.x.1.ext
+  %c.1 = getelementptr inbounds float, float addrspace(1)* %c, i64 %id.x.1.ext
+
+  %ld.c.0 = load float, float addrspace(1)* %c.0, align 4
+  store float 0.0, float addrspace(1)* %a.0, align 4 ; stores to %a sit between the two loads from %c; all three pointer arguments are noalias
+  %ld.c.1 = load float, float addrspace(1)* %c.1, align 4
+  store float 0.0, float addrspace(1)* %a.1, align 4
+
+  %add = fadd float %ld.c.0, %ld.c.1
+  store float %add, float addrspace(1)* %b, align 4
+  ret void
+}
+
+; This case fails to vectorize if not using the extra extension
+; handling in isConsecutiveAccess.
+
+; CHECK-LABEL: @zext_trunc_phi_1(
+; CHECK: loop:
+; CHECK: load <2 x i32>
+; CHECK: store <2 x i32>
+define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
+entry: ; loop copying two adjacent i32 elements per iteration from %c to %a; %b, %arst and %aoeu are not referenced in the body
+  %cmp0 = icmp eq i32 %n, 0
+  br i1 %cmp0, label %exit, label %loop
+
+loop:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ] ; 64-bit induction variable, truncated to i32 before it is used as an index
+  %trunc.iv = trunc i64 %indvars.iv to i32
+  %idx = shl i32 %trunc.iv, 4 ; low four bits are zero, so the or with 1 below yields %idx + 1
+
+  %idx.ext = zext i32 %idx to i64
+  %c.0 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.ext
+  %a.0 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.ext
+
+  %idx.1 = or i32 %idx, 1
+  %idx.1.ext = zext i32 %idx.1 to i64 ; second index is zero-extended on its own, not derived from %idx.ext
+  %c.1 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %idx.1.ext
+  %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idx.1.ext
+
+  %ld.c.0 = load i32, i32 addrspace(1)* %c.0, align 4
+  store i32 %ld.c.0, i32 addrspace(1)* %a.0, align 4
+  %ld.c.1 = load i32, i32 addrspace(1)* %c.1, align 4
+  store i32 %ld.c.1, i32 addrspace(1)* %a.1, align 4
+
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,135 @@
+; RUN: opt -S -mtriple=amdgcn--amdhsa -load-store-vectorizer < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn--amdhsa -passes='function(load-store-vectorizer)' < %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; Check that vectorizer can find a GEP through bitcast
+; CHECK-LABEL: @vect_zext_bitcast_f32_to_i32_idx(
+; CHECK: load <4 x i32>
+define void @vect_zext_bitcast_f32_to_i32_idx(float addrspace(1)* %arg1, i32 %base) { ; four i32 loads through bitcasts of float GEPs at zext(%base + 0..3); expected to become one <4 x i32> load
+  %add1 = add nuw i32 %base, 0
+  %zext1 = zext i32 %add1 to i64
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext1
+  %f2i1 = bitcast float addrspace(1)* %gep1 to i32 addrspace(1)*
+  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
+  %add2 = add nuw i32 %base, 1
+  %zext2 = zext i32 %add2 to i64
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext2
+  %f2i2 = bitcast float addrspace(1)* %gep2 to i32 addrspace(1)*
+  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
+  %add3 = add nuw i32 %base, 2
+  %zext3 = zext i32 %add3 to i64
+  %gep3 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext3
+  %f2i3 = bitcast float addrspace(1)* %gep3 to i32 addrspace(1)*
+  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
+  %add4 = add nuw i32 %base, 3
+  %zext4 = zext i32 %add4 to i64
+  %gep4 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %zext4
+  %f2i4 = bitcast float addrspace(1)* %gep4 to i32 addrspace(1)*
+  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
+  ret void
+}
+
+; CHECK-LABEL: @vect_zext_bitcast_i8_st1_to_i32_idx(
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) { ; i8 GEPs one byte apart, so the four 4-byte loads overlap instead of being consecutive; expected to stay scalar
+  %add1 = add nuw i32 %base, 0
+  %zext1 = zext i32 %add1 to i64
+  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1
+  %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
+  %add2 = add nuw i32 %base, 1
+  %zext2 = zext i32 %add2 to i64
+  %gep2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext2
+  %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)*
+  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
+  %add3 = add nuw i32 %base, 2
+  %zext3 = zext i32 %add3 to i64
+  %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
+  %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
+  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
+  %add4 = add nuw i32 %base, 3
+  %zext4 = zext i32 %add4 to i64
+  %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
+  %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
+  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
+  ret void
+}
+
+; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx(
+; CHECK: load <4 x i32>
+define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) { ; i8 GEPs four bytes apart (%base + 0, 4, 8, 12), so the four i32 loads are back to back; expected to become one <4 x i32> load
+  %add1 = add nuw i32 %base, 0
+  %zext1 = zext i32 %add1 to i64
+  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext1
+  %f2i1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+  %load1 = load i32, i32 addrspace(1)* %f2i1, align 4
+  %add2 = add nuw i32 %base, 4
+  %zext2 = zext i32 %add2 to i64
+  %gep2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext2
+  %f2i2 = bitcast i8 addrspace(1)* %gep2 to i32 addrspace(1)*
+  %load2 = load i32, i32 addrspace(1)* %f2i2, align 4
+  %add3 = add nuw i32 %base, 8
+  %zext3 = zext i32 %add3 to i64
+  %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3
+  %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)*
+  %load3 = load i32, i32 addrspace(1)* %f2i3, align 4
+  %add4 = add nuw i32 %base, 12
+  %zext4 = zext i32 %add4 to i64
+  %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4
+  %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)*
+  %load4 = load i32, i32 addrspace(1)* %f2i4, align 4
+  ret void
+}
+
+; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta(
+; CHECK: load <2 x i32>
+define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) { ; two i32 loads whose addresses are built differently (add-then-zext vs. zext-then-GEP); expected to become one <2 x i32> load
+  %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)*
+  %a.offset = add nuw i32 %base, 4
+  %t.offset.zexted = zext i32 %base to i64
+  %a.offset.zexted = zext i32 %a.offset to i64
+  %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted
+  %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted ; i16 index zext(%base + 4)
+  %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6 ; i16 index zext(%base) + 6, i.e. two i16 elements (4 bytes) past %a.ptr given the nuw add
+  %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)*
+  %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)*
+  %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted
+  %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted
+  ret void
+}
+
+; Check i1 corner case with zero-extended indices
+; CHECK-LABEL: @zexted_i1_gep_index(
+; CHECK: load i32
+; CHECK: load i32
+define void @zexted_i1_gep_index(i32 addrspace(1)* %p, i32 %val) {
+  %selector = icmp eq i32 %val, 0
+  %flipped = xor i1 %selector, 1 ; logical complement of %selector
+  %index.0 = zext i1 %selector to i64 ; 0 or 1
+  %index.1 = zext i1 %flipped to i64 ; the other of 0 / 1, so which GEP is the lower address depends on %val
+  %gep.0 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.0
+  %gep.1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.1
+  %val0 = load i32, i32 addrspace(1)* %gep.0
+  %val1 = load i32, i32 addrspace(1)* %gep.1
+  ret void
+}
+
+; Check i1 corner case with sign-extended indices
+; CHECK-LABEL: @sexted_i1_gep_index(
+; CHECK: load i32
+; CHECK: load i32
+define void @sexted_i1_gep_index(i32 addrspace(1)* %p, i32 %val) {
+  %selector = icmp eq i32 %val, 0
+  %flipped = xor i1 %selector, 1 ; logical complement of %selector
+  %index.0 = sext i1 %selector to i64 ; 0 or -1
+  %index.1 = sext i1 %flipped to i64 ; the other of 0 / -1, so which GEP is the lower address depends on %val
+  %gep.0 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.0
+  %gep.1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %index.1
+  %val0 = load i32, i32 addrspace(1)* %gep.0
+  %val1 = load i32, i32 addrspace(1)* %gep.1
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,118 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; Check position of the inserted vector load/store.  Vectorized loads should be
+; inserted at the position of the first load in the chain, and stores should be
+; inserted at the position of the last store.
+
+; CHECK-LABEL: @insert_load_point(
+; CHECK: %z = add i32 %x, 4
+; CHECK: load <2 x float>
+; CHECK: %w = add i32 %y, 9
+; CHECK: %foo = add i32 %z, %w
+define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
+entry: ; unrelated integer adds are interleaved with the two loads so the checks above can pin where the merged load lands
+  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
+  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
+  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
+  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
+
+  %z = add i32 %x, 4
+  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4 ; first load of the pair: the <2 x float> load is expected here, after %z and before %w
+  %w = add i32 %y, 9
+  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
+  %foo = add i32 %z, %w
+
+  store float 0.0, float addrspace(1)* %a.idx.x, align 4
+  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4
+
+  %add = fadd float %ld.c, %ld.c.idx.1
+  store float %add, float addrspace(1)* %b, align 4
+  store i32 %foo, i32 addrspace(3)* null, align 4 ; only use of %foo; presumably present so the marker adds are not dead
+  ret void
+}
+
+; CHECK-LABEL: @insert_store_point(
+; CHECK: %z = add i32 %x, 4
+; CHECK: %w = add i32 %y, 9
+; CHECK: store <2 x float>
+; CHECK: %foo = add i32 %z, %w
+define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
+entry: ; unrelated integer adds are interleaved with the two stores so the checks above can pin where the merged store lands
+  %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
+  %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
+  %a.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %a.idx.x, i64 1
+  %c.idx.x.1 = getelementptr inbounds float, float addrspace(1)* %c.idx.x, i64 1
+
+  %ld.c = load float, float addrspace(1)* %c.idx.x, align 4
+  %ld.c.idx.1 = load float, float addrspace(1)* %c.idx.x.1, align 4
+
+  %z = add i32 %x, 4
+  store float 0.0, float addrspace(1)* %a.idx.x, align 4
+  %w = add i32 %y, 9
+  store float 0.0, float addrspace(1)* %a.idx.x.1, align 4 ; last store of the pair: the <2 x float> store is expected here, after %w and before %foo
+  %foo = add i32 %z, %w
+
+  %add = fadd float %ld.c, %ld.c.idx.1
+  store float %add, float addrspace(1)* %b, align 4
+  store i32 %foo, i32 addrspace(3)* null, align 4 ; only use of %foo; presumably present so the marker adds are not dead
+  ret void
+}
+
+; Here we have four stores, with an aliasing load before the last one.  We can
+; vectorize the first three stores as <3 x float>, but this vectorized store must
+; be inserted at the location of the third scalar store, not the fourth one.
+;
+; CHECK-LABEL: @insert_store_point_alias(
+; CHECK: store <3 x float>
+; CHECK: load float, float addrspace(1)* %a.idx.2
+; CHECK: store float
+; CHECK-SAME: %a.idx.3
+define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %idx) {
+  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
+  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
+  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
+  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1
+
+  store float 0.0, float addrspace(1)* %a.idx, align 4
+  store float 0.0, float addrspace(1)* %a.idx.1, align 4
+  store float 0.0, float addrspace(1)* %a.idx.2, align 4
+  %x = load float, float addrspace(1)* %a.idx.2, align 4 ; reads the slot the third store wrote, so the merged <3 x float> store has to stay above this load
+  store float 0.0, float addrspace(1)* %a.idx.3, align 4 ; expected to remain a separate scalar store after the load
+
+  ret float %x
+}
+
+; Here we have four stores, with an aliasing load before the last one.  We
+; could vectorize two of the stores before the load (although we currently
+; don't), but the important thing is that we *don't* sink the store to
+; a[idx + 1] below the load.
+;
+; CHECK-LABEL: @insert_store_point_alias_ooo(
+; CHECK: store float
+; CHECK-SAME: %a.idx.3
+; CHECK: store float
+; CHECK-SAME: %a.idx.1
+; CHECK: store float
+; CHECK-SAME: %a.idx.2
+; CHECK: load float, float addrspace(1)* %a.idx.2
+; CHECK: store float
+; CHECK-SAME: %a.idx
+define float @insert_store_point_alias_ooo(float addrspace(1)* nocapture %a, i64 %idx) {
+  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
+  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
+  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
+  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1
+
+  store float 0.0, float addrspace(1)* %a.idx.3, align 4 ; stores are issued out of address order: idx+3, idx+1, idx+2, then idx after the load
+  store float 0.0, float addrspace(1)* %a.idx.1, align 4
+  store float 0.0, float addrspace(1)* %a.idx.2, align 4
+  %x = load float, float addrspace(1)* %a.idx.2, align 4 ; reads the slot written by the store just above
+  store float 0.0, float addrspace(1)* %a.idx, align 4
+
+  ret float %x
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; This is NOT OK to vectorize, as either load may alias either store.
+
+; CHECK: load double
+; CHECK: store double 0.000000e+00, double addrspace(1)* %a,
+; CHECK: load double
+; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1
+define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
+entry: ; none of %a, %b, %c is marked noalias; the checks above expect the scalar load/store/load/store order to be kept
+  %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
+  %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1
+
+  %ld.c = load double, double addrspace(1)* %c, align 8 ; may alias store to %a
+  store double 0.0, double addrspace(1)* %a, align 8
+
+  %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8 ; may alias store to %a
+  store double 0.0, double addrspace(1)* %a.idx.1, align 8
+
+  %add = fadd double %ld.c, %ld.c.idx.1
+  store double %add, double addrspace(1)* %b ; no explicit alignment on this store
+
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/invariant-load.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; CHECK-LABEL: @interleave(
+; CHECK: load <2 x double>, <2 x double> addrspace(1)* %{{.}}, align 8{{$}}
+; CHECK: store <2 x double> zeroinitializer
+; CHECK: store double %add
+define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
+entry: ; loads from %c interleaved with stores to %a; only the second load carries !invariant.load, and both pairs are expected to be vectorized
+  %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
+  %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1
+
+  %ld.c = load double, double addrspace(1)* %c, align 8
+  store double 0.0, double addrspace(1)* %a, align 8 ; Cannot alias invariant load
+
+  %ld.c.idx.1 = load double, double addrspace(1)* %c.idx.1, align 8, !invariant.load !0
+  store double 0.0, double addrspace(1)* %a.idx.1, align 8
+
+  %add = fadd double %ld.c, %ld.c.idx.1
+  store double %add, double addrspace(1)* %b ; expected to stay a scalar store after the vector store
+
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,223 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
+; ELT4-ALIGNED: store i32
+; ELT4-ALIGNED: store i32
+; ELT4-ALIGNED: store i32
+; ELT4-ALIGNED: store i32
+
+; ELT8: store <2 x i32>
+; ELT8: store <2 x i32>
+
+; ELT16-UNALIGNED: store <4 x i32>
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  store i32 9, i32 addrspace(5)* %out
+  store i32 1, i32 addrspace(5)* %out.gep.1
+  store i32 23, i32 addrspace(5)* %out.gep.2
+  store i32 19, i32 addrspace(5)* %out.gep.3
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
+; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 1
+; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 1
+; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
+
+; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32> addrspace(5)* %1, align 1
+; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32> addrspace(5)* %2, align 1
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  store i32 9, i32 addrspace(5)* %out, align 1
+  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
+  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
+; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 2
+; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 2
+; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 2
+; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 2
+
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 2
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  store i32 9, i32 addrspace(5)* %out, align 2
+  store i32 1, i32 addrspace(5)* %out.gep.1, align 2
+  store i32 23, i32 addrspace(5)* %out.gep.2, align 2
+  store i32 19, i32 addrspace(5)* %out.gep.3, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
+; ALL: store <4 x i8>
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3
+
+  store i8 9, i8 addrspace(5)* %out, align 4
+  store i8 1, i8 addrspace(5)* %out.gep.1
+  store i8 23, i8 addrspace(5)* %out.gep.2
+  store i8 19, i8 addrspace(5)* %out.gep.3
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3
+
+  store i8 9, i8 addrspace(5)* %out, align 1
+  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
+  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
+  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
+; ALL: store <2 x i16>
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
+
+  store i16 9, i16 addrspace(5)* %out, align 4
+  store i16 12, i16 addrspace(5)* %out.gep.1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
+; ALIGNED: store i16
+; ALIGNED: store i16
+
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 2
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
+
+  store i16 9, i16 addrspace(5)* %out, align 2
+  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
+; ALIGNED: store i16
+; ALIGNED: store i16
+
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
+
+  store i16 9, i16 addrspace(5)* %out, align 1
+  store i16 12, i16 addrspace(5)* %out.gep.1, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
+; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 8
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
+
+  store i16 9, i16 addrspace(5)* %out, align 8
+  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32
+; ELT4: store i32
+; ELT4: store i32
+; ELT4: store i32
+
+; ELT8: store <2 x i32>
+; ELT8: store i32
+
+; ELT16: store <3 x i32>
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+
+  store i32 9, i32 addrspace(5)* %out
+  store i32 1, i32 addrspace(5)* %out.gep.1
+  store i32 23, i32 addrspace(5)* %out.gep.2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store i32
+
+; ELT16-UNALIGNED: store <3 x i32>
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+
+  store i32 9, i32 addrspace(5)* %out, align 1
+  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
+  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+
+; UNALIGNED: store <3 x i8>
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+
+  store i8 9, i8 addrspace(5)* %out, align 1
+  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
+  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,657 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
+; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; TODO: Vector element tests
+; TODO: Non-zero base offset for load and store combinations
+; TODO: Same base addrspacecasted
+
+
+; CHECK-LABEL: @merge_global_store_2_constants_i8(
+; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
+define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+
+  store i8 123, i8 addrspace(1)* %out.gep.1
+  store i8 456, i8 addrspace(1)* %out, align 2 ; 456 does not fit in i8; the expected merged store above has it as i8 -56
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
+; CHECK: store <2 x i8>
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+
+  store i8 123, i8 addrspace(1)* %out.gep.1
+  store i8 456, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_i16
+; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+  store i16 123, i16 addrspace(1)* %out.gep.1
+  store i16 456, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_0_i16
+; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+  store i16 0, i16 addrspace(1)* %out.gep.1
+  store i16 0, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
+; CHECK: store <2 x i16>
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+
+  store i16 123, i16 addrspace(1)* %out.gep.1
+  store i16 456, i16 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
+; CHECK: store <2 x half>
+define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
+
+  store half 2.0, half addrspace(1)* %out.gep.1
+  store half 1.0, half addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_i32
+; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
+; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
+  store float 1.0, float addrspace(1)* %out.gep.1.bc
+  store i32 456, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
+; CHECK: store <2 x i32> <i32 1082130432, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* ; view element 1 as i32 so an integer can be stored into it
+  store i32 123, i32 addrspace(1)* %out.gep.1.bc ; element 1, stored first in program order
+  store float 4.0, float addrspace(1)* %out ; element 0; 4.0f has the bit pattern 0x40800000 = 1082130432
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_constants_i32
+; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out.gep.2
+  store i32 333, i32 addrspace(1)* %out.gep.3
+  store i32 1234, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_constants_f32_order
+; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+
+  store float 8.0, float addrspace(1)* %out
+  store float 1.0, float addrspace(1)* %out.gep.1
+  store float 2.0, float addrspace(1)* %out.gep.2
+  store float 4.0, float addrspace(1)* %out.gep.3
+  ret void
+}
+
+; The store to the first element (%out) is issued last, i.e. out of address order.
+; CHECK-LABEL: @merge_global_store_4_constants_f32
+; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+
+  store float 1.0, float addrspace(1)* %out.gep.1
+  store float 2.0, float addrspace(1)* %out.gep.2
+  store float 4.0, float addrspace(1)* %out.gep.3
+  store float 8.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
+; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+
+  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
+  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
+
+  store i32 11, i32 addrspace(1)* %out.gep.1.bc
+  store float 2.0, float addrspace(1)* %out.gep.2
+  store i32 17, i32 addrspace(1)* %out.gep.3.bc
+  store float 8.0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_3_constants_i32
+; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out.gep.2
+  store i32 1234, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_constants_i64
+; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
+define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
+
+  store i64 123, i64 addrspace(1)* %out.gep.1
+  store i64 456, i64 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_constants_i64
+; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
+; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
+define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
+  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
+  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
+
+  store i64 123, i64 addrspace(1)* %out.gep.1
+  store i64 456, i64 addrspace(1)* %out.gep.2
+  store i64 333, i64 addrspace(1)* %out.gep.3
+  store i64 1234, i64 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32
+; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
+; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
+; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1
+; CHECK: store <2 x i32> [[INSERT1]]
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+
+  %lo = load i32, i32 addrspace(1)* %in
+  %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+  store i32 %lo, i32 addrspace(1)* %out
+  store i32 %hi, i32 addrspace(1)* %out.gep.1
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base
+; CHECK: extractelement
+; CHECK: extractelement
+; CHECK: insertelement
+; CHECK: insertelement
+; CHECK: store <2 x i32>
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %lo = load i32, i32 addrspace(1)* %in.gep.0
+  %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+  store i32 %lo, i32 addrspace(1)* %out.gep.0
+  store i32 %hi, i32 addrspace(1)* %out.gep.1
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32
+; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
+; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0
+; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1
+; CHECK: store <2 x i32> [[INSERT1]]
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+
+  %lo = load i32, i32 addrspace(1)* %in
+  %hi = load i32, i32 addrspace(1)* %in.gep.1
+
+  store i32 %hi, i32 addrspace(1)* %out
+  store i32 %lo, i32 addrspace(1)* %out.gep.1
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+  %x = load i32, i32 addrspace(1)* %in
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %w = load i32, i32 addrspace(1)* %in.gep.3
+
+  store i32 %x, i32 addrspace(1)* %out
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %w, i32 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32
+; CHECK: load <3 x i32>
+; CHECK: store <3 x i32>
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+
+  %x = load i32, i32 addrspace(1)* %in
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+
+  store i32 %x, i32 addrspace(1)* %out
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
+  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
+
+  %x = load float, float addrspace(1)* %in
+  %y = load float, float addrspace(1)* %in.gep.1
+  %z = load float, float addrspace(1)* %in.gep.2
+  %w = load float, float addrspace(1)* %in.gep.3
+
+  store float %x, float addrspace(1)* %out
+  store float %y, float addrspace(1)* %out.gep.1
+  store float %z, float addrspace(1)* %out.gep.2
+  store float %w, float addrspace(1)* %out.gep.3
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
+  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
+  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
+
+  %x = load i32, i32 addrspace(1)* %in.gep.0
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %w = load i32, i32 addrspace(1)* %in.gep.3
+
+  store i32 %x, i32 addrspace(1)* %out.gep.0
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %w, i32 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+  %x = load i32, i32 addrspace(1)* %in
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %w = load i32, i32 addrspace(1)* %in.gep.3
+
+  ; Make sure the barrier between the loads and the stores doesn't prevent either group from being vectorized
+  tail call void @llvm.amdgcn.s.barrier() #1
+
+  store i32 %w, i32 addrspace(1)* %out.gep.3
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %x, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+
+  %x = load i32, i32 addrspace(1)* %in
+  %y = load i32, i32 addrspace(1)* %in.gep.1
+  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %w = load i32, i32 addrspace(1)* %in.gep.3
+
+  ; Make sure the barrier between the loads and the stores doesn't prevent either group from being vectorized
+  tail call void @llvm.amdgcn.s.barrier() #1
+
+  store i32 %w, i32 addrspace(1)* %out
+  store i32 %z, i32 addrspace(1)* %out.gep.1
+  store i32 %y, i32 addrspace(1)* %out.gep.2
+  store i32 %x, i32 addrspace(1)* %out.gep.3
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8
+; CHECK: load <4 x i8>
+; CHECK: extractelement <4 x i8>
+; CHECK: extractelement <4 x i8>
+; CHECK: extractelement <4 x i8>
+; CHECK: extractelement <4 x i8>
+; CHECK: insertelement <4 x i8>
+; CHECK: insertelement <4 x i8>
+; CHECK: insertelement <4 x i8>
+; CHECK: insertelement <4 x i8>
+; CHECK: store <4 x i8>
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
+  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
+  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
+
+  %x = load i8, i8 addrspace(1)* %in, align 4
+  %y = load i8, i8 addrspace(1)* %in.gep.1
+  %z = load i8, i8 addrspace(1)* %in.gep.2
+  %w = load i8, i8 addrspace(1)* %in.gep.3
+
+  store i8 %x, i8 addrspace(1)* %out, align 4
+  store i8 %y, i8 addrspace(1)* %out.gep.1
+  store i8 %z, i8 addrspace(1)* %out.gep.2
+  store i8 %w, i8 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align
+; CHECK: load <4 x i8>
+; CHECK: store <4 x i8>
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
+  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
+  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
+  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
+
+  %x = load i8, i8 addrspace(1)* %in
+  %y = load i8, i8 addrspace(1)* %in.gep.1
+  %z = load i8, i8 addrspace(1)* %in.gep.2
+  %w = load i8, i8 addrspace(1)* %in.gep.3
+
+  store i8 %x, i8 addrspace(1)* %out
+  store i8 %y, i8 addrspace(1)* %out.gep.1
+  store i8 %z, i8 addrspace(1)* %out.gep.2
+  store i8 %w, i8 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
+
+  %x = extractelement <4 x i32> %vec, i32 0
+  %y = extractelement <4 x i32> %vec, i32 1
+  %z = extractelement <4 x i32> %vec, i32 2
+  %w = extractelement <4 x i32> %vec, i32 3
+
+  store i32 %x, i32 addrspace(1)* %out
+  store i32 %y, i32 addrspace(1)* %out.gep.1
+  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %w, i32 addrspace(1)* %out.gep.3
+  ret void
+}
+
+; CHECK-LABEL: @merge_local_store_2_constants_i8
+; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2
+define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+
+  store i8 123, i8 addrspace(3)* %out.gep.1
+  store i8 456, i8 addrspace(3)* %out, align 2 ; 456 does not fit in i8; the expected merged store above has it as i8 -56
+  ret void
+}
+
+; CHECK-LABEL: @merge_local_store_2_constants_i32
+; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+
+  store i32 123, i32 addrspace(3)* %out.gep.1
+  store i32 456, i32 addrspace(3)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
+; CHECK: store i32
+; CHECK: store i32
+define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+
+  store i32 123, i32 addrspace(3)* %out.gep.1, align 2
+  store i32 456, i32 addrspace(3)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: @merge_local_store_4_constants_i32
+; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(3)*
+define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
+
+  store i32 123, i32 addrspace(3)* %out.gep.1
+  store i32 456, i32 addrspace(3)* %out.gep.2
+  store i32 333, i32 addrspace(3)* %out.gep.3
+  store i32 1234, i32 addrspace(3)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_5_constants_i32
+; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+; CHECK: store i32
+define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+  store i32 9, i32 addrspace(1)* %out, align 4
+  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+  store i32 12, i32 addrspace(1)* %idx1, align 4
+  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+  store i32 16, i32 addrspace(1)* %idx2, align 4
+  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+  store i32 -12, i32 addrspace(1)* %idx3, align 4
+  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
+  store i32 11, i32 addrspace(1)* %idx4, align 4
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_6_constants_i32
+; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+  store i32 13, i32 addrspace(1)* %out, align 4
+  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+  store i32 15, i32 addrspace(1)* %idx1, align 4
+  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+  store i32 62, i32 addrspace(1)* %idx2, align 4
+  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+  store i32 63, i32 addrspace(1)* %idx3, align 4
+  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
+  store i32 11, i32 addrspace(1)* %idx4, align 4
+  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
+  store i32 123, i32 addrspace(1)* %idx5, align 4
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_7_constants_i32
+; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+  store i32 34, i32 addrspace(1)* %out, align 4
+  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+  store i32 999, i32 addrspace(1)* %idx1, align 4
+  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+  store i32 65, i32 addrspace(1)* %idx2, align 4
+  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+  store i32 33, i32 addrspace(1)* %idx3, align 4
+  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
+  store i32 98, i32 addrspace(1)* %idx4, align 4
+  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
+  store i32 91, i32 addrspace(1)* %idx5, align 4
+  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
+  store i32 212, i32 addrspace(1)* %idx6, align 4
+  ret void
+}
+
+; CHECK-LABEL: @merge_global_store_8_constants_i32
+; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+  store i32 34, i32 addrspace(1)* %out, align 4
+  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
+  store i32 999, i32 addrspace(1)* %idx1, align 4
+  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
+  store i32 65, i32 addrspace(1)* %idx2, align 4
+  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
+  store i32 33, i32 addrspace(1)* %idx3, align 4
+  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
+  store i32 98, i32 addrspace(1)* %idx4, align 4
+  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
+  store i32 91, i32 addrspace(1)* %idx5, align 4
+  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
+  store i32 212, i32 addrspace(1)* %idx6, align 4
+  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
+  store i32 999, i32 addrspace(1)* %idx7, align 4
+  ret void
+}
+
+; CHECK-LABEL: @copy_v3i32_align4
+; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
+; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
+define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
+  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @copy_v3i64_align4
+; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
+; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
+define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
+  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @copy_v3f32_align4
+; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
+; CHECK: store <3 x float>
+define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
+  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
+  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @copy_v3f64_align4
+; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
+; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out
+define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
+  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
+  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
+  ret void
+}
+
+; Verify that we no longer hit asserts for this test case. No change expected.
+; CHECK-LABEL: @copy_vec_of_ptrs
+; CHECK: %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
+; CHECK: %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
+; CHECK: %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
+; CHECK: %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
+; CHECK: store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
+; CHECK: store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
+define amdgpu_kernel void @copy_vec_of_ptrs(<2 x i16*> addrspace(1)* %out,
+                                            <2 x i16*> addrspace(1)* %in ) #0 {
+  %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
+  %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
+  %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
+
+  %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
+  store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
+  store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,91 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; CHECK-LABEL: @merge_v2i32_v2i32(
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32> zeroinitializer
+define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
+entry:
+  %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1
+  %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1
+
+  %ld.c = load <2 x i32>, <2 x i32> addrspace(1)* %b, align 4
+  %ld.c.idx.1 = load <2 x i32>, <2 x i32> addrspace(1)* %b.1, align 4
+
+  store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a, align 4
+  store <2 x i32> zeroinitializer, <2 x i32> addrspace(1)* %a.1, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_v1i32_v1i32(
+; CHECK: load <2 x i32>
+; CHECK: store <2 x i32> zeroinitializer
+define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
+entry:
+  %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1
+  %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1
+
+  %ld.c = load <1 x i32>, <1 x i32> addrspace(1)* %b, align 4
+  %ld.c.idx.1 = load <1 x i32>, <1 x i32> addrspace(1)* %b.1, align 4
+
+  store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a, align 4
+  store <1 x i32> zeroinitializer, <1 x i32> addrspace(1)* %a.1, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @no_merge_v3i32_v3i32(
+; CHECK: load <3 x i32>
+; CHECK: load <3 x i32>
+; CHECK: store <3 x i32> zeroinitializer
+; CHECK: store <3 x i32> zeroinitializer
+define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
+entry:
+  %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1
+  %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1
+
+  %ld.c = load <3 x i32>, <3 x i32> addrspace(1)* %b, align 4
+  %ld.c.idx.1 = load <3 x i32>, <3 x i32> addrspace(1)* %b.1, align 4
+
+  store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a, align 4
+  store <3 x i32> zeroinitializer, <3 x i32> addrspace(1)* %a.1, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_v2i16_v2i16(
+; CHECK: load <4 x i16>
+; CHECK: store <4 x i16> zeroinitializer
+define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
+entry:
+  %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1
+  %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1
+
+  %ld.c = load <2 x i16>, <2 x i16> addrspace(1)* %b, align 4
+  %ld.c.idx.1 = load <2 x i16>, <2 x i16> addrspace(1)* %b.1, align 4
+
+  store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a, align 4
+  store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %a.1, align 4
+
+  ret void
+}
+
+; Ideally this would be merged
+; CHECK-LABEL: @merge_load_i32_v2i16(
+; CHECK: load i32,
+; CHECK: load <2 x i16>
+define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
+entry:
+  %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1
+  %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)*
+
+  %ld.0 = load i32, i32 addrspace(1)* %a
+  %ld.1 = load <2 x i16>, <2 x i16> addrspace(1)* %a.1.cast
+
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+@lds = internal addrspace(3) global [512 x float] undef, align 4
+
+; The original load has an implicit alignment of 4, and should not
+; increase to an align 8 load.
+
+; CHECK-LABEL: @load_keep_base_alignment_missing_align(
+; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
+  %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
+  %val0 = load float, float addrspace(3)* %ptr0
+
+  %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12
+  %val1 = load float, float addrspace(3)* %ptr1
+  %add = fadd float %val0, %val1
+  store float %add, float addrspace(1)* %out
+  ret void
+}
+
+
+; CHECK-LABEL: @store_keep_base_alignment_missing_align(
+; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
+define amdgpu_kernel void @store_keep_base_alignment_missing_align() {
+  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1
+  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
+  store float 0.0, float addrspace(3)* %arrayidx0
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,63 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; Checks that there is no crash when there are multiple tails
+; for the same head starting a chain.
+@0 = internal addrspace(3) global [16384 x i32] undef
+
+; CHECK-LABEL: @no_crash(
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: store i32 0
+; CHECK: store i32 0
+
+define amdgpu_kernel void @no_crash(i32 %arg) {
+  %tmp2 = add i32 %arg, 14
+  %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
+  %tmp4 = add i32 %arg, 15
+  %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4
+
+  store i32 0, i32 addrspace(3)* %tmp3, align 4
+  store i32 0, i32 addrspace(3)* %tmp5, align 4
+  store i32 0, i32 addrspace(3)* %tmp5, align 4
+  store i32 0, i32 addrspace(3)* %tmp5, align 4
+
+  ret void
+}
+
+; Check that adjacent memory locations are properly matched and the
+; longest chain is vectorized.
+
+; CHECK-LABEL: @interleave_get_longest
+; CHECK: load <4 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+
+define amdgpu_kernel void @interleave_get_longest(i32 %arg) {
+  %a1 = add i32 %arg, 1
+  %a2 = add i32 %arg, 2
+  %a3 = add i32 %arg, 3
+  %a4 = add i32 %arg, 4
+  %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg
+  %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1
+  %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2
+  %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3
+  %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4
+
+  %l1 = load i32, i32 addrspace(3)* %tmp2, align 4
+  %l2 = load i32, i32 addrspace(3)* %tmp1, align 4
+  store i32 0, i32 addrspace(3)* %tmp2, align 4
+  store i32 0, i32 addrspace(3)* %tmp1, align 4
+  %l3 = load i32, i32 addrspace(3)* %tmp2, align 4
+  %l4 = load i32, i32 addrspace(3)* %tmp3, align 4
+  %l5 = load i32, i32 addrspace(3)* %tmp4, align 4
+  %l6 = load i32, i32 addrspace(3)* %tmp5, align 4
+  %l7 = load i32, i32 addrspace(3)* %tmp5, align 4
+  %l8 = load i32, i32 addrspace(3)* %tmp5, align 4
+
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; CHECK-LABEL: @no_implicit_float(
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out.gep.2
+  store i32 333, i32 addrspace(1)* %out.gep.3
+  store i32 1234, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind noimplicitfloat }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; CHECK-LABEL: @optnone(
+; CHECK: store i32
+; CHECK: store i32
+define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @do_opt(
+; CHECK: store <2 x i32>
+define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) {
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+
+  store i32 123, i32 addrspace(1)* %out.gep.1
+  store i32 456, i32 addrspace(1)* %out
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,311 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; CHECK-LABEL: @merge_v2p1i8(
+; CHECK: load <2 x i64>
+; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
+; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
+; CHECK: store <2 x i64> zeroinitializer
+define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
+entry:
+  %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
+  %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1
+
+  %ld.c = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, align 4
+  %ld.c.idx.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b.1, align 4
+
+  store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a, align 4
+  store i8 addrspace(1)* null, i8 addrspace(1)* addrspace(1)* %a.1, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_v2p3i8(
+; CHECK: load <2 x i32>
+; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
+; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
+; CHECK: store <2 x i32> zeroinitializer
+define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
+entry:
+  %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1
+  %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1
+
+  %ld.c = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, align 4
+  %ld.c.idx.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b.1, align 4
+
+  store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a, align 4
+  store i8 addrspace(3)* null, i8 addrspace(3)* addrspace(3)* %a.1, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_i64_ptr64(
+; CHECK: load <2 x i64>
+; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
+; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
+define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
+entry:
+  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
+  %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
+
+  %ld.0 = load i64, i64 addrspace(1)* %a
+  %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_ptr64_i64(
+; CHECK: load <2 x i64>
+; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
+; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)*
+define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
+entry:
+  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
+  %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
+
+  %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast
+  %ld.1 = load i64, i64 addrspace(1)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_ptr64_i64(
+; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
+; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0
+; CHECK: store <2 x i64>
+define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
+entry:
+  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
+  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
+
+
+  store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast
+  store i64 %val1, i64 addrspace(1)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_i64_ptr64(
+; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
+; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1
+; CHECK: store <2 x i64>
+define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
+entry:
+  %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
+  %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)*
+
+  store i64 %val0, i64 addrspace(1)* %a.cast
+  store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_i32_ptr32(
+; CHECK: load <2 x i32>
+; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1
+; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)*
+define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
+entry:
+  %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
+  %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)*
+
+  %ld.0 = load i32, i32 addrspace(3)* %a
+  %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.1.cast
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_ptr32_i32(
+; CHECK: load <2 x i32>
+; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0
+; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)*
+define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
+entry:
+  %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
+  %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
+
+  %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a.cast
+  %ld.1 = load i32, i32 addrspace(3)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_ptr32_i32(
+; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32
+; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
+; CHECK: store <2 x i32>
+define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
+entry:
+  %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
+  %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
+
+  store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(3)* %a.cast
+  store i32 %val1, i32 addrspace(3)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_i32_ptr32(
+; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32
+; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1
+; CHECK: store <2 x i32>
+define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
+entry:
+  %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1
+  %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)*
+
+  store i32 %val0, i32 addrspace(3)* %a.cast
+  store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(3)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @no_merge_store_ptr32_i64(
+; CHECK: store i8 addrspace(3)*
+; CHECK: store i64
+define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
+entry:
+  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
+  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
+
+
+  store i8 addrspace(3)* %ptr0, i8 addrspace(3)* addrspace(1)* %a.cast
+  store i64 %val1, i64 addrspace(1)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @no_merge_store_i64_ptr32(
+; CHECK: store i64
+; CHECK: store i8 addrspace(3)*
+define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
+entry:
+  %a.1 =  getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1
+  %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)*
+
+  store i64 %val0, i64 addrspace(1)* %a.cast
+  store i8 addrspace(3)* %ptr1, i8 addrspace(3)* addrspace(1)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @no_merge_load_i64_ptr32(
+; CHECK: load i64,
+; CHECK: load i8 addrspace(3)*,
+define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
+entry:
+  %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
+  %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)*
+
+  %ld.0 = load i64, i64 addrspace(1)* %a
+  %ld.1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.1.cast
+
+  ret void
+}
+
+; CHECK-LABEL: @no_merge_load_ptr32_i64(
+; CHECK: load i8 addrspace(3)*,
+; CHECK: load i64,
+define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
+entry:
+  %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
+  %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
+
+  %ld.0 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a.cast
+  %ld.1 = load i64, i64 addrspace(1)* %a.1
+
+  ret void
+}
+
+; XXX - This isn't merged for some reason
+; CHECK-LABEL: @merge_v2p1i8_v2p1i8(
+; CHECK: load <2 x i8 addrspace(1)*>
+; CHECK: load <2 x i8 addrspace(1)*>
+; CHECK: store <2 x i8 addrspace(1)*>
+; CHECK: store <2 x i8 addrspace(1)*>
+define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
+entry:
+  %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1
+  %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1
+
+  %ld.c = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, align 4
+  %ld.c.idx.1 = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b.1, align 4
+
+  store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a, align 4
+  store <2 x i8 addrspace(1)*> zeroinitializer, <2 x i8 addrspace(1)*> addrspace(1)* %a.1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_ptr64_f64(
+; CHECK: load <2 x i64>
+; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
+; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)*
+; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
+; CHECK: bitcast i64 [[ELT1_INT]] to double
+define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
+entry:
+  %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
+  %a.1 =  getelementptr inbounds double, double addrspace(1)* %a, i64 1
+
+  %ld.0 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.cast
+  %ld.1 = load double, double addrspace(1)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_f64_ptr64(
+; CHECK: load <2 x i64>
+; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
+; CHECK: bitcast i64 [[ELT0]] to double
+; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
+; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
+define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
+entry:
+  %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
+  %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
+
+  %ld.0 = load double, double addrspace(1)* %a
+  %ld.1 = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a.1.cast
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_ptr64_f64(
+; CHECK: [[ELT0_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
+; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0
+; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64
+; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
+; CHECK: store <2 x i64>
+define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
+entry:
+  %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
+  %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
+
+  store i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* %a.cast
+  store double %val1, double addrspace(1)* %a.1
+
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_f64_ptr64(
+; CHECK: [[ELT0_INT:%[^ ]+]] = bitcast double %val0 to i64
+; CHECK: insertelement <2 x i64> undef, i64 [[ELT0_INT]], i32 0
+; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
+; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
+; CHECK: store <2 x i64>
+define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
+entry:
+  %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
+  %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)*
+
+  store double %val0, double addrspace(1)* %a.cast
+  store i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* %a.1
+
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,95 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -dce -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+define void @base_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, <3 x i32> addrspace(1)* %out) {
+; CHECK-LABEL: @base_case
+; CHECK: load <3 x i32>
+entry:
+  %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 1
+  %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 2
+  %gep4 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 1
+  %gep5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 2
+  %selected = select i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b
+  %selected14 = select i1 %cnd, i32 addrspace(1)* %gep1, i32 addrspace(1)* %gep4
+  %selected25 = select i1 %cnd, i32 addrspace(1)* %gep2, i32 addrspace(1)* %gep5
+  %val0 = load i32, i32 addrspace(1)* %selected, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected14, align 4
+  %val2 = load i32, i32 addrspace(1)* %selected25, align 4
+  %t0 = insertelement <3 x i32> undef, i32 %val0, i32 0
+  %t1 = insertelement <3 x i32> %t0, i32 %val1, i32 1
+  %t2 = insertelement <3 x i32> %t1, i32 %val2, i32 2
+  store <3 x i32> %t2, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @scev_targeting_complex_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) { ; loads two i32s through select-ed pointers and stores them to %out as a <2 x i32>
+; CHECK-LABEL: @scev_targeting_complex_case
+; CHECK: load <2 x i32>
+entry:
+  %base.x4 = shl i32 %base, 2 ; low two bits are zero, so adding 1, 2 or 3 below cannot carry
+  %base.x4.p1 = add i32 %base.x4, 1
+  %base.x4.p2 = add i32 %base.x4, 2
+  %base.x4.p3 = add i32 %base.x4, 3
+  %zext.x4 = zext i32 %base.x4 to i64
+  %zext.x4.p1 = zext i32 %base.x4.p1 to i64
+  %zext.x4.p2 = zext i32 %base.x4.p2 to i64
+  %zext.x4.p3 = zext i32 %base.x4.p3 to i64
+  %base.x16 = mul i64 %zext.x4, 4 ; byte offset; the same scaling by 4 is spelled as mul here and as shl on the next two lines
+  %base.x16.p4 = shl i64 %zext.x4.p1, 2 ; = %base.x16 + 4
+  %base.x16.p8 = shl i64 %zext.x4.p2, 2 ; = %base.x16 + 8
+  %base.x16.p12 = mul i64 %zext.x4.p3, 4 ; = %base.x16 + 12
+  %a.pi8 = bitcast i32 addrspace(1)* %a to i8 addrspace(1)*
+  %b.pi8 = bitcast i32 addrspace(1)* %b to i8 addrspace(1)*
+  %gep.a.base.x16 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16
+  %gep.b.base.x16.p4 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p4
+  %gep.a.base.x16.p8 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16.p8
+  %gep.b.base.x16.p12 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p12
+  %a.base.x16 = bitcast i8 addrspace(1)* %gep.a.base.x16 to i32 addrspace(1)*
+  %b.base.x16.p4 = bitcast i8 addrspace(1)* %gep.b.base.x16.p4 to i32 addrspace(1)*
+  %selected.base.x16.p0.or.4 = select i1 %cnd, i32 addrspace(1)* %a.base.x16, i32 addrspace(1)* %b.base.x16.p4 ; %cnd ? %a + x16 : %b + x16 + 4 (bytes); select done on i32 pointers
+  %gep.selected.base.x16.p8.or.12 = select i1 %cnd, i8 addrspace(1)* %gep.a.base.x16.p8, i8 addrspace(1)* %gep.b.base.x16.p12 ; %cnd ? %a + x16 + 8 : %b + x16 + 12 (bytes); select done on i8 pointers, cast afterwards
+  %selected.base.x16.p8.or.12 = bitcast i8 addrspace(1)* %gep.selected.base.x16.p8.or.12 to i32 addrspace(1)*
+  %selected.base.x16.p40.or.44 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p0.or.4, i64 10 ; + 10 * 4 = 40 bytes
+  %selected.base.x16.p44.or.48 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p8.or.12, i64 9 ; + 9 * 4 = 36 bytes, i.e. 4 bytes past the pointer above on either arm of %cnd
+  %val0 = load i32, i32 addrspace(1)* %selected.base.x16.p40.or.44, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected.base.x16.p44.or.48, align 4
+  %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
+  store <2 x i32> %t1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @nested_selects(i1 %cnd0, i1 %cnd1, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) { ; loads two i32s through two levels of select and stores them to %out as a <2 x i32>
+; CHECK-LABEL: @nested_selects
+; CHECK: load <2 x i32>
+entry:
+  %base.p1 = add nsw i32 %base, 1
+  %base.p2 = add i32 %base, 2 ; no nsw here, unlike .p1 and .p3
+  %base.p3 = add nsw i32 %base, 3
+  %base.x4 = mul i32 %base, 4
+  %base.x4.p5 = add i32 %base.x4, 5
+  %base.x4.p6 = add i32 %base.x4, 6
+  %sext = sext i32 %base to i64
+  %sext.p1 = sext i32 %base.p1 to i64
+  %sext.p2 = sext i32 %base.p2 to i64
+  %sext.p3 = sext i32 %base.p3 to i64
+  %sext.x4.p5 = sext i32 %base.x4.p5 to i64
+  %sext.x4.p6 = sext i32 %base.x4.p6 to i64
+  %gep.a.base = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext
+  %gep.a.base.p1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p1
+  %gep.a.base.p2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p2
+  %gep.a.base.p3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p3
+  %gep.b.base.x4.p5 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p5 ; NOTE(review): named gep.b.* but indexes %a; %b is not used anywhere in this function - confirm intended
+  %gep.b.base.x4.p6 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p6 ; NOTE(review): same as above, base pointer is %a
+  %selected.1.L = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p2, i32 addrspace(1)* %gep.b.base.x4.p5 ; inner pair, left: index base+2 or 4*base+5
+  %selected.1.R = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p3, i32 addrspace(1)* %gep.b.base.x4.p6 ; inner pair, right: index base+3 or 4*base+6, nominally one element after .L on both arms
+  %selected.0.L = select i1 %cnd0, i32 addrspace(1)* %gep.a.base, i32 addrspace(1)* %selected.1.L ; outer pair, left: index base, else the inner .L
+  %selected.0.R = select i1 %cnd0, i32 addrspace(1)* %gep.a.base.p1, i32 addrspace(1)* %selected.1.R ; outer pair, right: index base+1, else the inner .R
+  %val0 = load i32, i32 addrspace(1)* %selected.0.L, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected.0.R, align 4
+  %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
+  store <2 x i32> %t1, <2 x i32> addrspace(1)* %out
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,60 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; Check that, in the presence of an aliasing load, the stores preceding the
+; aliasing load are safe to vectorize.
+
+; CHECK-LABEL: store_vectorize_with_alias
+; CHECK: store <4 x float>
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+
+; Function Attrs: nounwind
+define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
+bb:
+  %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)*
+  %tmp1 = load float, float addrspace(1)* %tmp, align 4
+
+  %tmp2 = bitcast i8 addrspace(1)* %a to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp2, align 4
+  %tmp3 = getelementptr i8, i8 addrspace(1)* %a, i64 4
+  %tmp4 = bitcast i8 addrspace(1)* %tmp3 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp4, align 4
+  %tmp5 = getelementptr i8, i8 addrspace(1)* %a, i64 8
+  %tmp6 = bitcast i8 addrspace(1)* %tmp5 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp6, align 4
+  %tmp7 = getelementptr i8, i8 addrspace(1)* %a, i64 12
+  %tmp8 = bitcast i8 addrspace(1)* %tmp7 to float addrspace(1)*
+  store float %tmp1, float addrspace(1)* %tmp8, align 4
+
+  %tmp9 = getelementptr i8, i8 addrspace(1)* %b, i64 16
+  %tmp10 = bitcast i8 addrspace(1)* %tmp9 to float addrspace(1)*
+  %tmp11 = load float, float addrspace(1)* %tmp10, align 4
+  %tmp12 = getelementptr i8, i8 addrspace(1)* %b, i64 20
+  %tmp13 = bitcast i8 addrspace(1)* %tmp12 to float addrspace(1)*
+  %tmp14 = load float, float addrspace(1)* %tmp13, align 4
+  %tmp15 = getelementptr i8, i8 addrspace(1)* %b, i64 24
+  %tmp16 = bitcast i8 addrspace(1)* %tmp15 to float addrspace(1)*
+  %tmp17 = load float, float addrspace(1)* %tmp16, align 4
+  %tmp18 = getelementptr i8, i8 addrspace(1)* %b, i64 28
+  %tmp19 = bitcast i8 addrspace(1)* %tmp18 to float addrspace(1)*
+  %tmp20 = load float, float addrspace(1)* %tmp19, align 4
+
+  %tmp21 = getelementptr i8, i8 addrspace(1)* %a, i64 16
+  %tmp22 = bitcast i8 addrspace(1)* %tmp21 to float addrspace(1)*
+  store float %tmp11, float addrspace(1)* %tmp22, align 4
+  %tmp23 = getelementptr i8, i8 addrspace(1)* %a, i64 20
+  %tmp24 = bitcast i8 addrspace(1)* %tmp23 to float addrspace(1)*
+  store float %tmp14, float addrspace(1)* %tmp24, align 4
+  %tmp25 = getelementptr i8, i8 addrspace(1)* %a, i64 24
+  %tmp26 = bitcast i8 addrspace(1)* %tmp25 to float addrspace(1)*
+  store float %tmp17, float addrspace(1)* %tmp26, align 4
+  %tmp27 = getelementptr i8, i8 addrspace(1)* %a, i64 28
+  %tmp28 = bitcast i8 addrspace(1)* %tmp27 to float addrspace(1)*
+  store float %tmp20, float addrspace(1)* %tmp28, align 4
+
+  ret void
+}
+
+attributes #0 = { argmemonly nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,201 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; Checks that we don't merge loads/stores of structs, of integer types whose
+; bit width is not a multiple of eight (i1, i2, i9), or of vectors of those.
+
+%struct.foo = type { i32, i8 }
+
+declare void @use_i1(i1)
+declare void @use_i2(i2)
+declare void @use_i8(i8)
+declare void @use_foo(%struct.foo)
+declare void @use_v2i2(<2 x i2>)
+declare void @use_v4i2(<4 x i2>)
+declare void @use_v2i9(<2 x i9>)
+
+; CHECK-LABEL: @merge_store_2_constants_i1(
+; CHECK: store i1
+; CHECK: store i1
+define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
+  store i1 true, i1 addrspace(1)* %out.gep.1
+  store i1 false, i1 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_2_constants_i2(
+; CHECK: store i2 1
+; CHECK: store i2 -1
+define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
+  store i2 1, i2 addrspace(1)* %out.gep.1
+  store i2 -1, i2 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_different_store_sizes_i1_i8(
+; CHECK: store i1 true
+; CHECK: store i8 123
+define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
+  %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+  store i1 true, i1 addrspace(1)* %out.i1
+  store i8 123, i8 addrspace(1)* %out.gep.1
+  ret void
+}
+
+; CHECK-LABEL: @merge_different_store_sizes_i8_i1(
+; CHECK: store i8 123
+; CHECK: store i1 true
+define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
+  %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
+  store i8 123, i8 addrspace(1)* %out.gep.1
+  store i1 true, i1 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_2_constant_structs(
+; CHECK: store %struct.foo
+; CHECK: store %struct.foo
+define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
+  store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1
+  store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out
+  ret void
+}
+
+; sub-byte element size
+; CHECK-LABEL: @merge_store_2_constants_v2i2(
+; CHECK: store <2 x i2>
+; CHECK: store <2 x i2>
+define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
+  store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1
+  store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out
+  ret void
+}
+
+; sub-byte element size but byte size
+
+; CHECK-LABEL: @merge_store_2_constants_v4i2(
+; CHECK: store <4 x i2>
+; CHECK: store <4 x i2>
+define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
+  store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1
+  store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_2_constants_i1(
+; CHECK: load i1
+; CHECK: load i1
+define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
+  %x = load i1, i1 addrspace(1)* %out.gep.1
+  %y = load i1, i1 addrspace(1)* %out
+  call void @use_i1(i1 %x)
+  call void @use_i1(i1 %y)
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_2_constants_i2(
+; CHECK: load i2
+; CHECK: load i2
+define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
+  %x = load i2, i2 addrspace(1)* %out.gep.1
+  %y = load i2, i2 addrspace(1)* %out
+  call void @use_i2(i2 %x)
+  call void @use_i2(i2 %y)
+  ret void
+}
+
+; CHECK-LABEL: @merge_different_load_sizes_i1_i8(
+; CHECK: load i1
+; CHECK: load i8
+define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
+  %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+  %x = load i1, i1 addrspace(1)* %out.i1
+  %y = load i8, i8 addrspace(1)* %out.gep.1
+  call void @use_i1(i1 %x)
+  call void @use_i8(i8 %y)
+  ret void
+}
+
+; CHECK-LABEL: @merge_different_load_sizes_i8_i1(
+; CHECK: load i8
+; CHECK: load i1
+define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
+  %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
+  %x = load i8, i8 addrspace(1)* %out.gep.1
+  %y = load i1, i1 addrspace(1)* %out
+  call void @use_i8(i8 %x)
+  call void @use_i1(i1 %y)
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_2_constant_structs(
+; CHECK: load %struct.foo
+; CHECK: load %struct.foo
+define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
+  %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1
+  %y = load %struct.foo, %struct.foo addrspace(1)* %out
+  call void @use_foo(%struct.foo %x)
+  call void @use_foo(%struct.foo %y)
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_2_constants_v2i2(
+; CHECK: load <2 x i2>
+; CHECK: load <2 x i2>
+define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
+  %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1
+  %y = load <2 x i2>, <2 x i2> addrspace(1)* %out
+  call void @use_v2i2(<2 x i2> %x)
+  call void @use_v2i2(<2 x i2> %y)
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_2_constants_v4i2(
+; CHECK: load <4 x i2>
+; CHECK: load <4 x i2>
+define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
+  %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1
+  %y = load <4 x i2>, <4 x i2> addrspace(1)* %out
+  call void @use_v4i2(<4 x i2> %x)
+  call void @use_v4i2(<4 x i2> %y)
+  ret void
+}
+
+; CHECK-LABEL: @merge_store_2_constants_i9(
+; CHECK: store i9 3
+; CHECK: store i9 -5
+define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1
+  store i9 3, i9 addrspace(1)* %out.gep.1
+  store i9 -5, i9 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @merge_load_2_constants_v2i9(
+; CHECK: load <2 x i9>
+; CHECK: load <2 x i9>
+define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
+  %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1
+  %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1
+  %y = load <2 x i9>, <2 x i9> addrspace(1)* %out
+  call void @use_v2i9(<2 x i9> %x)
+  call void @use_v2i9(<2 x i9> %y)
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/merge-across-side-effects.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,209 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -load-store-vectorizer -S -o - %s | FileCheck %s
+
+; Check that the load/store vectorizer is willing to move loads/stores across
+; intervening instructions only if it's safe.
+;
+;  - Loads can be moved across instructions that don't write or throw.
+;  - Stores can only be moved across instructions which don't read, write, or
+;    throw.
+
+declare void @fn()
+declare void @fn_nounwind() #0
+declare void @fn_nounwind_writeonly() #1
+declare void @fn_nounwind_readonly() #2
+declare void @fn_writeonly() #3
+declare void @fn_readonly() #4
+declare void @fn_readnone() #5
+
+; CHECK-LABEL: @load_fn
+; CHECK: load
+; CHECK: call void @fn()
+; CHECK: load
+define void @load_fn(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  %v0 = load i32, i32* %p, align 8
+  call void @fn()
+  %v1 = load i32, i32* %p.1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_fn_nounwind
+; CHECK: load
+; CHECK: call void @fn_nounwind()
+; CHECK: load
+define void @load_fn_nounwind(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  %v0 = load i32, i32* %p, align 8
+  call void @fn_nounwind() #0
+  %v1 = load i32, i32* %p.1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_fn_nounwind_writeonly
+; CHECK: load
+; CHECK: call void @fn_nounwind_writeonly()
+; CHECK: load
+define void @load_fn_nounwind_writeonly(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  %v0 = load i32, i32* %p, align 8
+  call void @fn_nounwind_writeonly() #1
+  %v1 = load i32, i32* %p.1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_fn_nounwind_readonly
+; CHECK-DAG: load <2 x i32>
+; CHECK-DAG: call void @fn_nounwind_readonly()
+define void @load_fn_nounwind_readonly(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  %v0 = load i32, i32* %p, align 8
+  call void @fn_nounwind_readonly() #2
+  %v1 = load i32, i32* %p.1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_fn_readonly
+; CHECK: load
+; CHECK: call void @fn_readonly
+; CHECK: load
+define void @load_fn_readonly(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  %v0 = load i32, i32* %p, align 8
+  call void @fn_readonly() #4
+  %v1 = load i32, i32* %p.1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_fn_writeonly
+; CHECK: load
+; CHECK: call void @fn_writeonly()
+; CHECK: load
+define void @load_fn_writeonly(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  %v0 = load i32, i32* %p, align 8
+  call void @fn_writeonly() #3
+  %v1 = load i32, i32* %p.1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_fn_readnone
+; CHECK-DAG: load <2 x i32>
+; CHECK-DAG: call void @fn_readnone()
+define void @load_fn_readnone(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  %v0 = load i32, i32* %p, align 8
+  call void @fn_readnone() #5
+  %v1 = load i32, i32* %p.1, align 4
+  ret void
+}
+
+; ------------------------------------------------
+; Same tests, but now for stores instead of loads.
+; ------------------------------------------------
+
+; CHECK-LABEL: @store_fn
+; CHECK: store
+; CHECK: call void @fn()
+; CHECK: store
+define void @store_fn(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  store i32 0, i32* %p
+  call void @fn()
+  store i32 0, i32* %p.1
+  ret void
+}
+
+; CHECK-LABEL: @store_fn_nounwind
+; CHECK: store
+; CHECK: call void @fn_nounwind()
+; CHECK: store
+define void @store_fn_nounwind(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  store i32 0, i32* %p
+  call void @fn_nounwind() #0
+  store i32 0, i32* %p.1
+  ret void
+}
+
+; CHECK-LABEL: @store_fn_nounwind_writeonly
+; CHECK: store
+; CHECK: call void @fn_nounwind_writeonly()
+; CHECK: store
+define void @store_fn_nounwind_writeonly(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  store i32 0, i32* %p
+  call void @fn_nounwind_writeonly() #1
+  store i32 0, i32* %p.1
+  ret void
+}
+
+; CHECK-LABEL: @store_fn_nounwind_readonly
+; CHECK: store
+; CHECK: call void @fn_nounwind_readonly()
+; CHECK: store
+define void @store_fn_nounwind_readonly(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  store i32 0, i32* %p
+  call void @fn_nounwind_readonly() #2
+  store i32 0, i32* %p.1
+  ret void
+}
+
+; CHECK-LABEL: @store_fn_readonly
+; CHECK: store
+; CHECK: call void @fn_readonly
+; CHECK: store
+define void @store_fn_readonly(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  store i32 0, i32* %p
+  call void @fn_readonly() #4
+  store i32 0, i32* %p.1
+  ret void
+}
+
+; CHECK-LABEL: @store_fn_writeonly
+; CHECK: store
+; CHECK: call void @fn_writeonly()
+; CHECK: store
+define void @store_fn_writeonly(i32* %p) #0 {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+
+  store i32 0, i32* %p
+  call void @fn_writeonly() #3
+  store i32 0, i32* %p.1
+  ret void
+}
+
+; This is the only store idiom we can vectorize.
+; CHECK-LABEL: @store_fn_readnone
+; CHECK-DAG: store <2 x i32>
+; CHECK-DAG: call void @fn_readnone()
+define void @store_fn_readnone(i32* %p) #0 { ; two adjacent i32 stores separated by a call to a readnone function
+  %p.1 = getelementptr i32, i32* %p, i32 1 ; %p + 4 bytes
+
+  store i32 0, i32* %p, align 8
+  call void @fn_readnone() #5
+  store i32 0, i32* %p.1, align 8 ; NOTE(review): %p and %p + 4 cannot both be 8-byte aligned; the load tests in this file use align 4 for %p.1 - confirm intended
+  ret void
+}
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind writeonly }
+attributes #2 = { nounwind readonly }
+attributes #3 = { writeonly }
+attributes #4 = { readonly }
+; readnone implies nounwind, so no need to test separately
+attributes #5 = { nounwind readnone }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/non-instr-bitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -load-store-vectorizer -S -o - %s | FileCheck %s
+
+; Load from a constant.  This can be vectorized, but shouldn't crash us.
+
+@global = internal addrspace(1) constant [4 x float] [float 0xBF71111120000000, float 0x3F70410420000000, float 0xBF81111120000000, float 0x3FB5555560000000], align 4
+
+define void @foo() {
+  ; CHECK: load <4 x float>
+  %a = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 0), align 16
+  %b = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 1), align 4
+  %c = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 2), align 4
+  %d = load float, float addrspace(1)* getelementptr inbounds ([4 x float], [4 x float] addrspace(1)* @global, i64 0, i64 3), align 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; RUN: opt -load-store-vectorizer -march=nvptx64 -mcpu=sm_35 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: @foo
+define i32 @foo(i32* %ptr) { ; returns the sum of two adjacent i32 loads that both carry !invariant.load
+  %ptr1 = getelementptr i32, i32* %ptr, i32 1
+  %p1 = addrspacecast i32* %ptr1 to i32 addrspace(1)* ; NOTE(review): %p1 has no uses in this function
+  ; CHECK: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 8, !invariant.load !0
+  %v0 = load i32, i32* %ptr, align 8, !invariant.load !0 ; first element; its align 8 is the alignment expected on the merged load above
+  %v1 = load i32, i32* %ptr1, align 4, !invariant.load !0 ; second element, 4 bytes after %ptr
+  %sum = add i32 %v0, %v1
+  ret i32 %sum
+}
+
+!0 = !{}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,80 @@
+; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt                 -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt -codegenprepare -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
+; RUN: opt                 -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S -o - | FileCheck %s
+
+target triple = "x86_64--"
+
+%union = type { { [4 x [4 x [4 x [16 x float]]]], [4 x [4 x [4 x [16 x float]]]], [10 x [10 x [4 x float]]] } }
+
+@global_pointer = external unnamed_addr global { %union, [2000 x i8] }, align 4
+
+; Function Attrs: convergent nounwind
+define void @test(i32 %base) #0 { ; address math is done with GEPs in entry; the two loads sit in a separate block
+; CHECK-LABEL: @test(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4 ; clears the low two bits, so %mul331 is a multiple of 4
+  %add350.4 = add i32 4, %mul331 ; still a multiple of 4, even if the add wraps
+  %idx351.4 = zext i32 %add350.4 to i64
+  %arrayidx352.4 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.4 ; field path 0, 0, 1 selects the second array of the struct inside %union; the final index picks a float
+  %tmp296.4 = bitcast float* %arrayidx352.4 to i32*
+  %add350.5 = add i32 5, %mul331 ; = %add350.4 + 1 with no further wrap, because %add350.4 has zero low bits
+  %idx351.5 = zext i32 %add350.5 to i64
+  %arrayidx352.5 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.5 ; same field path, final index one greater than above
+  %tmp296.5 = bitcast float* %arrayidx352.5 to i32*
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:
+  ; If and only if the loads are in a different BB from the GEPs, codegenprepare
+  ; would try to turn the GEPs into math, which makes LoadStoreVectorizer's job
+  ; harder.
+  %tmp297.4 = load i32, i32* %tmp296.4, align 4, !tbaa !0
+  %tmp297.5 = load i32, i32* %tmp296.5, align 4, !tbaa !0
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+define void @test.codegenprepared(i32 %base) #0 { ; same index computation as @test, but each load address is built from i8 GEPs inside the loads block
+; CHECK-LABEL: @test.codegenprepared(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4 ; clears the low two bits, so %mul331 is a multiple of 4
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %add350.5 = add i32 5, %mul331 ; = %add350.4 + 1 with no further wrap, because %add350.4 has zero low bits
+  %idx351.5 = zext i32 %add350.5 to i64
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:                                            ; preds = %entry
+  %sunkaddr = mul i64 %idx351.4, 4 ; element index scaled to a byte offset (4 bytes per element)
+  %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr
+  %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096 ; 4096 = 4*4*4*16 floats * 4 bytes, the size of one [4 x [4 x [4 x [16 x float]]]]; presumably the offset of the second array field that @test reaches by GEP
+  %0 = bitcast i8* %sunkaddr2 to i32*
+  %tmp297.4 = load i32, i32* %0, align 4, !tbaa !0
+  %sunkaddr3 = mul i64 %idx351.5, 4 ; = %sunkaddr + 4
+  %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr3
+  %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096 ; same constant offset as %sunkaddr2, so the two load addresses are 4 bytes apart
+  %1 = bitcast i8* %sunkaddr5 to i32*
+  %tmp297.5 = load i32, i32* %1, align 4, !tbaa !0
+  br label %exit
+
+exit:                                             ; preds = %loads, %entry
+  ret void
+}
+
+attributes #0 = { convergent nounwind }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/compare-scev-by-complexity.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,77 @@
+; RUN: opt -load-store-vectorizer %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' %s -S | FileCheck %s
+
+; Check that setting wrapping flags after a SCEV node is created
+; does not invalidate the "sorted by complexity" invariant for the
+; operands of commutative and associative SCEV operators.
+
+target triple = "x86_64--"
+
+@global_value0 = external constant i32
+@global_value1 = external constant i32
+@other_value = external global float
+@a = external global float
+@b = external global float
+@c = external global float
+@d = external global float
+@plus1 = external global i32
+@cnd = external global i8
+
+; Function Attrs: nounwind
+define void @main() local_unnamed_addr #0 {
+; CHECK-LABEL: @main()
+; CHECK: [[PTR:%[0-9]+]] = bitcast float* %preheader.load0.address to <2 x float>*
+; CHECK:  = load <2 x float>, <2 x float>* [[PTR]]
+; CHECK-LABEL: for.body23:
+entry:
+  %tmp = load i32, i32* @global_value0, !range !0
+  %tmp2 = load i32, i32* @global_value1
+  %and.i.i = and i32 %tmp2, 2
+  %add.nuw.nsw.i.i = add nuw nsw i32 %and.i.i, 0
+  %mul.i.i = shl nuw nsw i32 %add.nuw.nsw.i.i, 1
+  %and6.i.i = and i32 %tmp2, 3
+  %and9.i.i = and i32 %tmp2, 4
+  %add.nuw.nsw10.i.i = add nuw nsw i32 %and6.i.i, %and9.i.i
+  %conv3.i42.i = add nuw nsw i32 %mul.i.i, 1
+  %reass.add346.7 = add nuw nsw i32 %add.nuw.nsw10.i.i, 56
+  %reass.mul347.7 = mul nuw nsw i32 %tmp, %reass.add346.7
+  %add7.i.7 = add nuw nsw i32 %reass.mul347.7, 0
+  %preheader.address0.idx = add nuw nsw i32 %add7.i.7, %mul.i.i
+  %preheader.address0.idx.zext = zext i32 %preheader.address0.idx to i64
+  %preheader.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.address0.idx.zext
+  %preheader.load0. = load float, float* %preheader.load0.address, align 4, !tbaa !1
+  %common.address.idx = add nuw nsw i32 %add7.i.7, %conv3.i42.i
+  %preheader.header.common.address.idx.zext = zext i32 %common.address.idx to i64
+  %preheader.load1.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext
+  %preheader.load1. = load float, float* %preheader.load1.address, align 4, !tbaa !1
+  br label %for.body23
+
+for.body23:                                       ; preds = %for.body23, %entry
+  %loop.header.load0.address = getelementptr inbounds float, float* @other_value, i64 %preheader.header.common.address.idx.zext
+  %loop.header.load0. = load float, float* %loop.header.load0.address, align 4, !tbaa !1
+  %reass.mul343.7 = mul nuw nsw i32 %reass.add346.7, 72
+  %add7.i286.7.7 = add nuw nsw i32 %reass.mul343.7, 56
+  %add9.i288.7.7 = add nuw nsw i32 %add7.i286.7.7, %mul.i.i
+  %loop.header.address1.idx = add nuw nsw i32 %add9.i288.7.7, 1
+  %loop.header.address1.idx.zext = zext i32 %loop.header.address1.idx to i64
+  %loop.header.load1.address = getelementptr inbounds float, float* @other_value, i64 %loop.header.address1.idx.zext
+  %loop.header.load1. = load float, float* %loop.header.load1.address, align 4, !tbaa !1
+  store float %preheader.load0., float* @a, align 4, !tbaa !1
+  store float %preheader.load1., float* @b, align 4, !tbaa !1
+  store float %loop.header.load0., float* @c, align 4, !tbaa !1
+  store float %loop.header.load1., float* @d, align 4, !tbaa !1
+  %loaded.cnd = load i8, i8* @cnd
+  %condition = trunc i8 %loaded.cnd to i1
+  br i1 %condition, label %for.body23, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{i32 0, i32 65536}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"float", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @correct_order(
+; CHECK: [[LOAD_PTR:%[0-9]+]] = bitcast i32* %next.gep1
+; CHECK: load <2 x i32>, <2 x i32>* [[LOAD_PTR]]
+; CHECK: load i32, i32* %next.gep
+; CHECK: [[STORE_PTR:%[0-9]+]] = bitcast i32* %next.gep
+; CHECK: store <2 x i32>
+; CHECK-SAME: <2 x i32>* [[STORE_PTR]]
+; CHECK: load i32, i32* %next.gep1
+define void @correct_order(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/load-width.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/load-width.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/load-width.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+
+define <8 x double> @loadwidth_insert_extract(double* %ptr) {
+    %a = bitcast double* %ptr to <2 x double> *
+    %b = getelementptr <2 x double>, <2 x double>* %a, i32 1
+    %c = getelementptr <2 x double>, <2 x double>* %a, i32 2
+    %d = getelementptr <2 x double>, <2 x double>* %a, i32 3
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW-NOT: load
+; CHECK-KNL: load <8 x double>
+; CHECK-KNL-NOT: load
+    %la = load <2 x double>, <2 x double> *%a
+    %lb = load <2 x double>, <2 x double> *%b
+    %lc = load <2 x double>, <2 x double> *%c
+    %ld = load <2 x double>, <2 x double> *%d
+    ; Scalarize everything - Explicitly not a shufflevector to test this code
+    ; path in the LSV
+    %v1 = extractelement <2 x double> %la, i32 0
+    %v2 = extractelement <2 x double> %la, i32 1
+    %v3 = extractelement <2 x double> %lb, i32 0
+    %v4 = extractelement <2 x double> %lb, i32 1
+    %v5 = extractelement <2 x double> %lc, i32 0
+    %v6 = extractelement <2 x double> %lc, i32 1
+    %v7 = extractelement <2 x double> %ld, i32 0
+    %v8 = extractelement <2 x double> %ld, i32 1
+    ; Make a vector again
+    %i1 = insertelement <8 x double> undef, double %v1, i32 0
+    %i2 = insertelement <8 x double> %i1, double %v2, i32 1
+    %i3 = insertelement <8 x double> %i2, double %v3, i32 2
+    %i4 = insertelement <8 x double> %i3, double %v4, i32 3
+    %i5 = insertelement <8 x double> %i4, double %v5, i32 4
+    %i6 = insertelement <8 x double> %i5, double %v6, i32 5
+    %i7 = insertelement <8 x double> %i6, double %v7, i32 6
+    %i8 = insertelement <8 x double> %i7, double %v8, i32 7
+    ret <8 x double> %i8
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,48 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \
+; RUN:     FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S < %s | \
+; RUN:     FileCheck %s
+;
+; The GPU Load & Store Vectorizer may merge differently-typed accesses into a
+; single instruction. This test checks that we merge TBAA tags for such
+; accesses correctly.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; struct S {
+;   float f;
+;   int i;
+; };
+%struct.S = type { float, i32 }
+
+; float foo(S *p) {
+;   p->f -= 1;
+;   p->i -= 1;
+;   return p->f;
+; }
+define float @foo(%struct.S* %p) {
+entry:
+; CHECK-LABEL: foo
+; CHECK: load <2 x i32>, {{.*}}, !tbaa [[TAG_char:!.*]]
+; CHECK: store <2 x i32> {{.*}}, !tbaa [[TAG_char]]
+  %f = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 0
+  %0 = load float, float* %f, align 4, !tbaa !2
+  %sub = fadd float %0, -1.000000e+00
+  store float %sub, float* %f, align 4, !tbaa !2
+  %i = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 1
+  %1 = load i32, i32* %i, align 4, !tbaa !8
+  %sub1 = add nsw i32 %1, -1
+  store i32 %sub1, i32* %i, align 4, !tbaa !8
+  ret float %sub
+}
+
+!2 = !{!3, !4, i64 0}
+!3 = !{!"_ZTS1S", !4, i64 0, !7, i64 4}
+!4 = !{!"float", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!"int", !5, i64 0}
+!8 = !{!3, !7, i64 4}
+
+; CHECK-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0}
+; CHECK-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -load-store-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+%rec = type { i32, i28 }
+
+; We currently do not optimize this scenario.
+; But we verify that we no longer crash when compiling this.
+define void @test1(%rec* %out, %rec* %in) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[IN1:%.*]] = getelementptr [[REC:%.*]], %rec* [[IN:%.*]], i16 0, i32 0
+; CHECK-NEXT:    [[IN2:%.*]] = getelementptr [[REC]], %rec* [[IN]], i16 0, i32 1
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, i32* [[IN1]], align 8
+; CHECK-NEXT:    [[VAL2:%.*]] = load i28, i28* [[IN2]]
+; CHECK-NEXT:    [[OUT1:%.*]] = getelementptr [[REC]], %rec* [[OUT:%.*]], i16 0, i32 0
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr [[REC]], %rec* [[OUT]], i16 0, i32 1
+; CHECK-NEXT:    store i32 [[VAL1]], i32* [[OUT1]], align 8
+; CHECK-NEXT:    store i28 [[VAL2]], i28* [[OUT2]]
+; CHECK-NEXT:    ret void
+;
+  %in1 = getelementptr %rec, %rec* %in, i16 0, i32 0
+  %in2 = getelementptr %rec, %rec* %in, i16 0, i32 1
+  %val1 = load i32, i32* %in1, align 8
+  %val2 = load i28, i28* %in2
+  %out1 = getelementptr %rec, %rec* %out, i16 0, i32 0
+  %out2 = getelementptr %rec, %rec* %out, i16 0, i32 1
+  store i32 %val1, i32* %out1, align 8
+  store i28 %val2, i28* %out2
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple=x86_64-unknown-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+%struct.buffer_t = type { i32, i8* }
+
+; Check an i32 and i8* get vectorized, and that the two accesses
+; (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions which were between the first and last load should be
+; reordered preserving their relative order inasmuch as possible.
+
+; CHECK-LABEL: @preserve_order_32(
+; CHECK: load <2 x i32>
+; CHECK: %buff.val = load i8
+; CHECK: store i8 0
+define void @preserve_order_32(%struct.buffer_t* noalias %buff) #0 {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 1
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
+  store i8 0, i8* %buff.p, align 8
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i32 0, i32 0
+  %buff.int = load i32, i32* %tmp0, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,78 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+%struct.buffer_t = type { i64, i8* }
+%struct.nested.buffer = type { %struct.buffer_t, %struct.buffer_t }
+
+; Check an i64 and i8* get vectorized, and that the two accesses
+; (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions which were between the first and last load should be
+; reordered preserving their relative order inasmuch as possible.
+
+; CHECK-LABEL: @preserve_order_64(
+; CHECK: load <2 x i64>
+; CHECK: %buff.val = load i8
+; CHECK: store i8 0
+define void @preserve_order_64(%struct.buffer_t* noalias %buff) #0 {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
+  store i8 0, i8* %buff.p, align 8
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 16
+  ret void
+}
+
+; Check reordering recurses correctly.
+
+; CHECK-LABEL: @transitive_reorder(
+; CHECK: load <2 x i64>
+; CHECK: %buff.val = load i8
+; CHECK: store i8 0
+define void @transitive_reorder(%struct.buffer_t* noalias %buff, %struct.nested.buffer* noalias %nest) #0 {
+entry:
+  %nest0_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest0_0, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
+  store i8 0, i8* %buff.p, align 8
+  %nest1_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest1_0, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 16
+  ret void
+}
+
+; Check for no vectorization over phi node
+
+; CHECK-LABEL: @no_vect_phi(
+; CHECK: load i8*
+; CHECK: load i8
+; CHECK: store i8 0
+; CHECK: load i64
+define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1
+  %buff.val = load i8, i8* %buff.p
+  store i8 0, i8* %buff.p, align 8
+  br label %"for something"
+
+"for something":
+  %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
+
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 16
+
+  %index.next = add i64 %index, 8
+  %cmp_res = icmp eq i64 %index.next, 8
+  br i1 %cmp_res, label %ending, label %"for something"
+
+ending:
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,118 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Check that subsets of the load/store chains are still vectorized in the
+; presence of interleaved loads/stores.
+
+; CHECK-LABEL: @interleave_2L_2S(
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+define void @interleave_2L_2S(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @interleave_3L_2S_1L(
+; CHECK: load <3 x i32>
+; CHECK: store <2 x i32>
+; CHECK: load i32
+
+define void @interleave_3L_2S_1L(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  %l1 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @chain_suffix(
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load <2 x i32>
+define void @chain_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+
+; CHECK-LABEL: @chain_prefix_suffix(
+; CHECK: load <2 x i32>
+; CHECK: store <2 x i32>
+; CHECK: load <3 x i32>
+define void  @chain_prefix_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+  %next.gep3 = getelementptr i32, i32* %ptr, i64 3
+
+  %l1 = load i32, i32* %next.gep, align 4
+  %l2 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep2, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+  %l5 = load i32, i32* %next.gep3, align 4
+
+  ret void
+}
+
+; FIXME: If the chain is too long and TLI says misaligned is not fast,
+; then LSV fails to vectorize anything in that chain.
+; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.
+
+; CHECK-LABEL: @interleave_get_longest(
+; CHECK: load <3 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+
+define void @interleave_get_longest(i32* noalias %ptr) {
+  %tmp1 = getelementptr i32, i32* %ptr, i64 0
+  %tmp2 = getelementptr i32, i32* %ptr, i64 1
+  %tmp3 = getelementptr i32, i32* %ptr, i64 2
+  %tmp4 = getelementptr i32, i32* %ptr, i64 3
+
+  %l1 = load i32, i32* %tmp2, align 4
+  %l2 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp2, align 4
+  store i32 0, i32* %tmp1, align 4
+  %l3 = load i32, i32* %tmp2, align 4
+  %l4 = load i32, i32* %tmp3, align 4
+  %l5 = load i32, i32* %tmp4, align 4
+  %l6 = load i32, i32* %tmp4, align 4
+  %l7 = load i32, i32* %tmp4, align 4
+
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/vector-scalar.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -mcpu haswell -S -o - %s | FileCheck %s
+
+; Check that the LoadStoreVectorizer does not crash due to not differentiating <1 x T> and T.
+
+; CHECK-LABEL: @vector_scalar(
+; CHECK: store double
+; CHECK: store <1 x double>
+define void @vector_scalar(double* %ptr, double %a, <1 x double> %b) {
+  %1 = bitcast double* %ptr to <1 x double>*
+  %2 = getelementptr <1 x double>, <1 x double>* %1, i32 1
+  store double %a, double* %ptr, align 8
+  store <1 x double> %b, <1 x double>* %2, align 8
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll (added)
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/int_sideeffect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt -S < %s -load-store-vectorizer | FileCheck %s
+; RUN: opt -S < %s -passes='function(load-store-vectorizer)' | FileCheck %s
+
+declare void @llvm.sideeffect()
+
+; Check that loads and stores are vectorized across calls to @llvm.sideeffect.
+
+; CHECK-LABEL: test
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+define void @test(float* %p) {
+    %p0 = getelementptr float, float* %p, i64 0
+    %p1 = getelementptr float, float* %p, i64 1
+    %p2 = getelementptr float, float* %p, i64 2
+    %p3 = getelementptr float, float* %p, i64 3
+    %l0 = load float, float* %p0, align 16
+    %l1 = load float, float* %p1
+    %l2 = load float, float* %p2
+    call void @llvm.sideeffect()
+    %l3 = load float, float* %p3
+    store float %l0, float* %p0, align 16
+    call void @llvm.sideeffect()
+    store float %l1, float* %p1
+    store float %l2, float* %p2
+    store float %l3, float* %p3
+    ret void
+}

Added: llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll (added)
+++ llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/kryo-large-stride.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,53 @@
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -max-prefetch-iters-ahead=1000 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -max-prefetch-iters-ahead=1000 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=kryo -mtriple=aarch64-gnu-linux -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+
+; ALL-LABEL: @small_stride(
+define void @small_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; ALL-NOT: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; ALL-LABEL: @large_stride(
+define void @large_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; LARGE_PREFETCH: call void @llvm.prefetch
+; NO_LARGE_PREFETCH-NOT: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 150 
+  %exitcond = icmp eq i64 %indvars.iv.next, 160000
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll (added)
+++ llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,55 @@
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -max-prefetch-iters-ahead=100 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=generic -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -passes=loop-data-prefetch -max-prefetch-iters-ahead=100 -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=generic -mtriple=arm64-apple-ios -passes=loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+
+; ALL-LABEL: @small_stride(
+define void @small_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; ALL-NOT: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; ALL-LABEL: @large_stride(
+define void @large_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; LARGE_PREFETCH: call void @llvm.prefetch
+; NO_LARGE_PREFETCH-NOT: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 300
+  %exitcond = icmp eq i64 %indvars.iv.next, 160000
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,4 @@
+config.suffixes = ['.ll']
+
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll (added)
+++ llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,86 @@
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch \
+; RUN:     -pass-remarks=loop-data-prefetch -S -max-prefetch-iters-ahead=100 \
+; RUN:     -pass-remarks-with-hotness \
+; RUN:     < %s 2>&1 | FileCheck %s
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -passes=loop-data-prefetch \
+; RUN:     -pass-remarks=loop-data-prefetch -S -max-prefetch-iters-ahead=100 \
+; RUN:     -pass-remarks-with-hotness \
+; RUN:     < %s 2>&1 | FileCheck %s
+
+; ModuleID = '/tmp/s.c'
+source_filename = "/tmp/s.c"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+;   1	struct MyStruct {
+;   2	  int field;
+;   3	  char kk[2044];
+;   4	} *my_struct;
+;   5
+;   6	int f(struct MyStruct *p, int N) {
+;   7	  int total = 0;
+;   8	  for (int i = 0; i < N; i++) {
+;   9	    total += my_struct[i].field;
+;  10	  }
+;  11	  return total;
+;  12	}
+
+; CHECK: remark: /tmp/s.c:9:27: prefetched memory access (hotness: 600)
+
+%struct.MyStruct = type { i32, [2044 x i8] }
+
+@my_struct = common global %struct.MyStruct* null, align 8 ; pointer to the MyStruct array that @f loads and indexes
+
+define i32 @f(%struct.MyStruct* nocapture readnone %p, i32 %N) !dbg !6 !prof !21 { ; sums my_struct[i].field for i in [0, N); %p is never used
+entry:
+  %cmp6 = icmp sgt i32 %N, 0, !dbg !8 ; loop is skipped entirely when N <= 0
+  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup, !dbg !9, !prof !22
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = load %struct.MyStruct*, %struct.MyStruct** @my_struct, align 8, !dbg !10, !tbaa !11 ; array base is loaded once, outside the loop
+  br label %for.body, !dbg !9
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %total.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
+  ret i32 %total.0.lcssa, !dbg !15
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %total.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+  %field = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %0, i64 %indvars.iv, i32 0, !dbg !16 ; &my_struct[i].field; one MyStruct is 4 + 2044 = 2048 bytes
+  %1 = load i32, i32* %field, align 4, !dbg !16, !tbaa !17 ; carries location !16 (line 9, column 27), the one named in the expected remark
+  %add = add nsw i32 %1, %total.07, !dbg !20
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !9
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !9 ; narrow the 64-bit counter to compare against the i32 bound %N
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !9
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9, !prof !23
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0"}
+!6 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !7, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 8, column: 21, scope: !6)
+!9 = !DILocation(line: 8, column: 3, scope: !6)
+!10 = !DILocation(line: 9, column: 14, scope: !6)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"any pointer", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 11, column: 3, scope: !6)
+!16 = !DILocation(line: 9, column: 27, scope: !6)
+!17 = !{!18, !19, i64 0}
+!18 = !{!"MyStruct", !19, i64 0, !13, i64 4}
+!19 = !{!"int", !13, i64 0}
+!20 = !DILocation(line: 9, column: 11, scope: !6)
+!21 = !{!"function_entry_count", i64 6} ; attached to @f through !prof on its define line
+!22 = !{!"branch_weights", i32 99, i32 1} ; entry branch of @f: 99 for entering the loop, 1 for skipping it
+!23 = !{!"branch_weights", i32 1, i32 99} ; loop branch of @f: 1 for leaving the loop, 99 for another iteration

Added: llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/opt-remark.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/opt-remark.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/opt-remark.ll (added)
+++ llvm/trunk/test/Transforms/LoopDataPrefetch/AArch64/opt-remark.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,81 @@
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch \
+; RUN:     -pass-remarks=loop-data-prefetch -S -max-prefetch-iters-ahead=100 \
+; RUN:     < %s 2>&1 | FileCheck %s
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -passes=loop-data-prefetch \
+; RUN:     -pass-remarks=loop-data-prefetch -S -max-prefetch-iters-ahead=100 \
+; RUN:     < %s 2>&1 | FileCheck %s
+
+; ModuleID = '/tmp/s.c'
+source_filename = "/tmp/s.c"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+;   1	struct MyStruct {
+;   2	  int field;
+;   3	  char kk[2044];
+;   4	} *my_struct;
+;   5
+;   6	int f(struct MyStruct *p, int N) {
+;   7	  int total = 0;
+;   8	  for (int i = 0; i < N; i++) {
+;   9	    total += my_struct[i].field;
+;  10	  }
+;  11	  return total;
+;  12	}
+
+; CHECK: remark: /tmp/s.c:9:27: prefetched memory access
+
+%struct.MyStruct = type { i32, [2044 x i8] }
+
+@my_struct = common global %struct.MyStruct* null, align 8 ; pointer to the MyStruct array that @f loads and indexes
+
+define i32 @f(%struct.MyStruct* nocapture readnone %p, i32 %N) !dbg !6 { ; sums my_struct[i].field for i in [0, N); %p is never used
+entry:
+  %cmp6 = icmp sgt i32 %N, 0, !dbg !8 ; loop is skipped entirely when N <= 0
+  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup, !dbg !9
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = load %struct.MyStruct*, %struct.MyStruct** @my_struct, align 8, !dbg !10, !tbaa !11 ; array base is loaded once, outside the loop
+  br label %for.body, !dbg !9
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %total.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
+  ret i32 %total.0.lcssa, !dbg !15
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %total.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+  %field = getelementptr inbounds %struct.MyStruct, %struct.MyStruct* %0, i64 %indvars.iv, i32 0, !dbg !16 ; &my_struct[i].field; one MyStruct is 4 + 2044 = 2048 bytes
+  %1 = load i32, i32* %field, align 4, !dbg !16, !tbaa !17 ; carries location !16 (line 9, column 27), the one named in the expected remark
+  %add = add nsw i32 %1, %total.07, !dbg !20
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !9
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !9 ; narrow the 64-bit counter to compare against the i32 bound %N
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !9
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0"}
+!6 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !7, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 8, column: 21, scope: !6)
+!9 = !DILocation(line: 8, column: 3, scope: !6)
+!10 = !DILocation(line: 9, column: 14, scope: !6)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"any pointer", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 11, column: 3, scope: !6)
+!16 = !DILocation(line: 9, column: 27, scope: !6)
+!17 = !{!18, !19, i64 0}
+!18 = !{!"MyStruct", !19, i64 0, !13, i64 4}
+!19 = !{!"int", !13, i64 0}
+!20 = !DILocation(line: 9, column: 11, scope: !6)

Added: llvm/trunk/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll (added)
+++ llvm/trunk/test/Transforms/LoopDataPrefetch/PowerPC/basic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+; RUN: opt -mcpu=a2 -loop-data-prefetch -S < %s | FileCheck %s
+; RUN: opt -mcpu=a2 -passes=loop-data-prefetch -S < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define void @foo(double* nocapture %a, double* nocapture readonly %b) { ; a[i] = b[i] + 1.0 for i in [0, 1600)
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; CHECK: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8 ; the only load in the loop (reads %b)
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; unit stride: consecutive doubles
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600 ; constant trip count of 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDataPrefetch/PowerPC/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDataPrefetch/PowerPC/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDataPrefetch/PowerPC/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopDataPrefetch/PowerPC/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if not 'PowerPC' in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/LoopDeletion/2007-07-23-InfiniteLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/2007-07-23-InfiniteLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/2007-07-23-InfiniteLoop.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/2007-07-23-InfiniteLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt < %s -loop-deletion -S | grep switch
+; PR 1564
+  
+define fastcc void @out() { ; never returns: %loop switches on the constant 0 and always re-enters itself
+    start:
+            br label %loop
+    unreachable: ; block label that happens to share its name with the instruction below
+            unreachable
+    loop:
+            switch i32 0, label %unreachable [ ; the RUN line greps the output for this switch, so it must survive the pass
+                     i32 0, label %loop
+            ]
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/2008-05-06-Phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/2008-05-06-Phi.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/2008-05-06-Phi.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/2008-05-06-Phi.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,109 @@
+; RUN: opt < %s -inline -instcombine -jump-threading -licm -loop-unswitch -instcombine -indvars -loop-deletion -gvn -simplifycfg -verify -disable-output
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9"
+	%struct.BF_BitstreamElement = type { i32, i16 }
+	%struct.BF_BitstreamPart = type { i32, %struct.BF_BitstreamElement* }
+	%struct.BF_PartHolder = type { i32, %struct.BF_BitstreamPart* }
+	%struct.Bit_stream_struc = type { i8*, i32, %struct.FILE*, i8*, i32, i32, i32, i32 }
+	%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
+	%struct.III_scalefac_t = type { [22 x i32], [13 x [3 x i32]] }
+	%struct.III_side_info_t = type { i32, i32, i32, [2 x [4 x i32]], [2 x %struct.anon] }
+	%struct.__sFILEX = type opaque
+	%struct.__sbuf = type { i8*, i32 }
+	%struct.anon = type { [2 x %struct.gr_info_ss] }
+	%struct.gr_info = type { i32, i32, i32, i32, i32, i32, i32, i32, [3 x i32], [3 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, [4 x i32] }
+	%struct.gr_info_ss = type { %struct.gr_info }
+	%struct.lame_global_flags = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8*, i8*, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, float, float, float, float, i32, i32, i32, i32, i32, i32, i32, i32 }
+@scaleFactorsPH = external global [2 x [2 x %struct.BF_PartHolder*]]		; <[2 x [2 x %struct.BF_PartHolder*]]*> [#uses=1]
+@slen1_tab = external constant [16 x i32]		; <[16 x i32]*> [#uses=1]
+
+declare %struct.BF_PartHolder* @BF_addElement(%struct.BF_PartHolder*, %struct.BF_BitstreamElement*) nounwind 
+
+define %struct.BF_PartHolder* @BF_addEntry(%struct.BF_PartHolder* %thePH, i32 %value, i32 %length) nounwind  { ; returns %thePH when %length == 0, otherwise the result of @BF_addElement
+entry:
+	%myElement = alloca %struct.BF_BitstreamElement		; <%struct.BF_BitstreamElement*> [#uses=2]
+	%tmp1 = getelementptr %struct.BF_BitstreamElement, %struct.BF_BitstreamElement* %myElement, i32 0, i32 0		; <i32*> [#uses=1]
+	store i32 %value, i32* %tmp1, align 8 ; only field 0 of %myElement is written; the i16 field is left as allocated
+	%tmp7 = icmp eq i32 %length, 0		; <i1> [#uses=1]
+	br i1 %tmp7, label %bb13, label %bb
+
+bb:		; preds = %entry
+	%tmp10 = call %struct.BF_PartHolder* @BF_addElement( %struct.BF_PartHolder* %thePH, %struct.BF_BitstreamElement* %myElement ) nounwind 		; <%struct.BF_PartHolder*> [#uses=1]
+	ret %struct.BF_PartHolder* %tmp10
+
+bb13:		; preds = %entry
+	ret %struct.BF_PartHolder* %thePH
+}
+
+define void @III_format_bitstream(%struct.lame_global_flags* %gfp, i32 %bitsPerFrame, [2 x [576 x i32]]* %l3_enc, %struct.III_side_info_t* %l3_side, [2 x %struct.III_scalefac_t]* %scalefac, %struct.Bit_stream_struc* %in_bs) nounwind  { ; forwards four of its six arguments to @encodeMainData; %bitsPerFrame and %in_bs are unused
+entry:
+	call fastcc void @encodeMainData( %struct.lame_global_flags* %gfp, [2 x [576 x i32]]* %l3_enc, %struct.III_side_info_t* %l3_side, [2 x %struct.III_scalefac_t]* %scalefac ) nounwind 
+	unreachable
+}
+
+define internal fastcc void @encodeMainData(%struct.lame_global_flags* %gfp, [2 x [576 x i32]]* %l3_enc, %struct.III_side_info_t* %si, [2 x %struct.III_scalefac_t]* %scalefac) nounwind  { ; two nested counting loops (sfb outer, window inner) guarded by constant compares; %l3_enc and %si are unused
+entry:
+	%tmp69 = getelementptr %struct.lame_global_flags, %struct.lame_global_flags* %gfp, i32 0, i32 43		; <i32*> [#uses=1]
+	%tmp70 = load i32, i32* %tmp69, align 4		; <i32> [#uses=1]
+	%tmp71 = icmp eq i32 %tmp70, 1		; <i1> [#uses=1]
+	br i1 %tmp71, label %bb352, label %bb498
+
+bb113:		; preds = %bb132
+	%tmp123 = getelementptr [2 x %struct.III_scalefac_t], [2 x %struct.III_scalefac_t]* %scalefac, i32 0, i32 0, i32 1, i32 %sfb.0, i32 %window.0		; <i32*> [#uses=1]
+	%tmp124 = load i32, i32* %tmp123, align 4		; <i32> [#uses=1]
+	%tmp126 = load %struct.BF_PartHolder*, %struct.BF_PartHolder** %tmp80, align 4		; <%struct.BF_PartHolder*> [#uses=1]
+	%tmp128 = call %struct.BF_PartHolder* @BF_addEntry( %struct.BF_PartHolder* %tmp126, i32 %tmp124, i32 %tmp93 ) nounwind 		; <%struct.BF_PartHolder*> [#uses=1]
+	store %struct.BF_PartHolder* %tmp128, %struct.BF_PartHolder** %tmp80, align 4
+	%tmp131 = add i32 %window.0, 1		; <i32> [#uses=1]
+	br label %bb132
+
+bb132:		; preds = %bb140, %bb113
+	%window.0 = phi i32 [ %tmp131, %bb113 ], [ 0, %bb140 ]		; <i32> [#uses=3]
+	%tmp134 = icmp slt i32 %window.0, 3		; <i1> [#uses=1] -- inner loop runs while window < 3
+	br i1 %tmp134, label %bb113, label %bb137
+
+bb137:		; preds = %bb132
+	%tmp139 = add i32 %sfb.0, 1		; <i32> [#uses=1]
+	br label %bb140
+
+bb140:		; preds = %bb341, %bb137
+	%sfb.0 = phi i32 [ %tmp139, %bb137 ], [ 0, %bb341 ]		; <i32> [#uses=3]
+	%tmp142 = icmp slt i32 %sfb.0, 6		; <i1> [#uses=1] -- outer loop runs while sfb < 6
+	br i1 %tmp142, label %bb132, label %bb174
+
+bb166:		; preds = %bb174
+	%tmp160 = load %struct.BF_PartHolder*, %struct.BF_PartHolder** %tmp80, align 4		; <%struct.BF_PartHolder*> [#uses=1]
+	%tmp162 = call %struct.BF_PartHolder* @BF_addEntry( %struct.BF_PartHolder* %tmp160, i32 0, i32 0 ) nounwind 		; <%struct.BF_PartHolder*> [#uses=0]
+	unreachable
+
+bb174:		; preds = %bb140
+	%tmp176 = icmp slt i32 6, 12		; <i1> [#uses=1] -- both operands constant: 6 < 12 is always true
+	br i1 %tmp176, label %bb166, label %bb341
+
+bb341:		; preds = %bb352, %bb174
+	%tmp80 = getelementptr [2 x [2 x %struct.BF_PartHolder*]], [2 x [2 x %struct.BF_PartHolder*]]* @scaleFactorsPH, i32 0, i32 0, i32 0		; <%struct.BF_PartHolder**> [#uses=3] -- used by bb113 and bb166, which appear earlier in the text
+	%tmp92 = getelementptr [16 x i32], [16 x i32]* @slen1_tab, i32 0, i32 0		; <i32*> [#uses=1]
+	%tmp93 = load i32, i32* %tmp92, align 4		; <i32> [#uses=1]
+	br label %bb140
+
+bb352:		; preds = %entry
+	%tmp354 = icmp slt i32 0, 2		; <i1> [#uses=1] -- both operands constant: 0 < 2 is always true
+	br i1 %tmp354, label %bb341, label %return
+
+bb498:		; preds = %entry
+	ret void
+
+return:		; preds = %bb352
+	ret void
+}
+
+define void @getframebits(%struct.lame_global_flags* %gfp, i32* %bitsPerFrame, i32* %mean_bits) nounwind  { ; stub: the body is a single unreachable and no argument is used
+entry:
+	unreachable
+}
+
+define i32 @lame_encode_buffer(%struct.lame_global_flags* %gfp, i16* %buffer_l, i16* %buffer_r, i32 %nsamples, i8* %mp3buf, i32 %mp3buf_size) nounwind  { ; stub: the body is a single unreachable, so no i32 is ever returned
+entry:
+	unreachable
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/2011-06-21-phioperands.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,182 @@
+; RUN: opt -loop-deletion -disable-output < %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%0 = type { %"class.llvm::SmallVectorImpl", [1 x %"union.llvm::SmallVectorBase::U"] }
+%"class.clang::SourceLocation" = type { i32 }
+%"class.clang::driver::Arg" = type { %"class.clang::driver::Option"*, %"class.clang::driver::Arg"*, i32, i8, %0 }
+%"class.clang::driver::Option" = type { i32 (...)**, i32, %"class.clang::SourceLocation", i8*, %"class.clang::driver::OptionGroup"*, %"class.clang::driver::Option"*, i8 }
+%"class.clang::driver::OptionGroup" = type { %"class.clang::driver::Option" }
+%"class.llvm::SmallVectorBase" = type { i8*, i8*, i8*, %"union.llvm::SmallVectorBase::U" }
+%"class.llvm::SmallVectorImpl" = type { %"class.llvm::SmallVectorTemplateBase" }
+%"class.llvm::SmallVectorTemplateBase" = type { %"class.llvm::SmallVectorTemplateCommon" }
+%"class.llvm::SmallVectorTemplateCommon" = type { %"class.llvm::SmallVectorBase" }
+%"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
+
+define void @_ZNK5clang6driver7ArgList20AddAllArgsTranslatedERN4llvm11SmallVectorIPKcLj16EEENS0_12OptSpecifierES5_b(i1 zeroext %Joined) nounwind align 2 { ; CFG-only reproducer: every block is a bare terminator except the single phi near the end
+entry:
+  br i1 undef, label %entry.split.us, label %entry.entry.split_crit_edge
+
+entry.entry.split_crit_edge:                      ; preds = %entry
+  br label %entry.split
+
+entry.split.us:                                   ; preds = %entry
+  br label %for.cond.i14.us
+
+for.cond.i14.us:                                  ; preds = %for.inc.i38.us, %entry.split.us
+  br i1 true, label %for.cond.i50.us-lcssa.us, label %if.end.i23.us ; constant-true condition: always takes the exit edge
+
+for.inc.i38.us:                                   ; preds = %if.end.i23.us
+  br label %for.cond.i14.us
+
+if.end.i23.us:                                    ; preds = %for.cond.i14.us
+  br i1 true, label %for.cond.i50.us-lcssa.us, label %for.inc.i38.us
+
+for.cond.i50.us-lcssa.us:                         ; preds = %if.end.i23.us, %for.cond.i14.us
+  br label %for.cond.i50
+
+entry.split:                                      ; preds = %entry.entry.split_crit_edge
+  br label %for.cond.i14
+
+for.cond.i14:                                     ; preds = %for.inc.i38, %entry.split
+  br i1 undef, label %for.cond.i50.us-lcssa, label %if.end.i23
+
+if.end.i23:                                       ; preds = %for.cond.i14
+  br i1 undef, label %for.cond.i50.us-lcssa, label %for.inc.i38
+
+for.inc.i38:                                      ; preds = %if.end.i23
+  br label %for.cond.i14
+
+for.cond.i50.us-lcssa:                            ; preds = %if.end.i23, %for.cond.i14
+  br label %for.cond.i50
+
+for.cond.i50:                                     ; preds = %for.cond.i50.us-lcssa, %for.cond.i50.us-lcssa.us
+  br label %for.cond
+
+for.cond.loopexit.us-lcssa:                       ; preds = %if.end.i, %for.cond.i
+  br label %for.cond.loopexit
+
+for.cond.loopexit:                                ; preds = %for.cond.loopexit.us-lcssa.us, %for.cond.loopexit.us-lcssa
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond.loopexit, %for.cond.i50
+  br i1 undef, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  br i1 %Joined, label %if.then, label %if.else ; the only use of the function's argument
+
+if.then:                                          ; preds = %for.body
+  br i1 undef, label %cond.false.i.i, label %_ZN4llvm9StringRefC1EPKc.exit
+
+cond.false.i.i:                                   ; preds = %if.then
+  unreachable
+
+_ZN4llvm9StringRefC1EPKc.exit:                    ; preds = %if.then
+  br i1 undef, label %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit, label %cond.false.i.i91
+
+cond.false.i.i91:                                 ; preds = %_ZN4llvm9StringRefC1EPKc.exit
+  unreachable
+
+_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit: ; preds = %_ZN4llvm9StringRefC1EPKc.exit
+  br i1 undef, label %cond.false.i.i.i, label %if.end13.i.i.i.i
+
+if.end13.i.i.i.i:                                 ; preds = %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit
+  br i1 undef, label %land.lhs.true16.i.i.i.i, label %if.end19.i.i.i.i
+
+land.lhs.true16.i.i.i.i:                          ; preds = %if.end13.i.i.i.i
+  br i1 undef, label %cond.false.i.i.i, label %_ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i
+
+_ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i:         ; preds = %land.lhs.true16.i.i.i.i
+  br i1 undef, label %cond.false.i.i.i, label %if.end19.i.i.i.i
+
+if.end19.i.i.i.i:                                 ; preds = %_ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i, %if.end13.i.i.i.i
+  br i1 undef, label %land.lhs.true22.i.i.i.i, label %_ZN4llvmplERKNS_9StringRefEPKc.exit
+
+land.lhs.true22.i.i.i.i:                          ; preds = %if.end19.i.i.i.i
+  br i1 undef, label %cond.false.i.i.i, label %_ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i
+
+_ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i:          ; preds = %land.lhs.true22.i.i.i.i
+  br i1 undef, label %cond.false.i.i.i, label %_ZN4llvmplERKNS_9StringRefEPKc.exit
+
+cond.false.i.i.i:                                 ; preds = %_ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i, %land.lhs.true22.i.i.i.i, %_ZNK4llvm5Twine8isBinaryEv.exit8.i.i.i.i, %land.lhs.true16.i.i.i.i, %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit
+  unreachable
+
+_ZN4llvmplERKNS_9StringRefEPKc.exit:              ; preds = %_ZNK4llvm5Twine8isBinaryEv.exit.i.i.i.i, %if.end19.i.i.i.i
+  br i1 undef, label %Retry.i, label %if.end.i99
+
+Retry.i:                                          ; preds = %if.end.i99, %_ZN4llvmplERKNS_9StringRefEPKc.exit
+  br i1 undef, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit, label %new.notnull.i
+
+new.notnull.i:                                    ; preds = %Retry.i
+  br label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit
+
+if.end.i99:                                       ; preds = %_ZN4llvmplERKNS_9StringRefEPKc.exit
+  br label %Retry.i
+
+_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit: ; preds = %new.notnull.i, %Retry.i
+  br label %for.cond.i.preheader
+
+if.else:                                          ; preds = %for.body
+  br i1 undef, label %Retry.i108, label %if.end.i113
+
+Retry.i108:                                       ; preds = %if.end.i113, %if.else
+  br i1 undef, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114, label %new.notnull.i110
+
+new.notnull.i110:                                 ; preds = %Retry.i108
+  br label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114
+
+if.end.i113:                                      ; preds = %if.else
+  br label %Retry.i108
+
+_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114: ; preds = %new.notnull.i110, %Retry.i108
+  br i1 undef, label %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit125, label %cond.false.i.i123
+
+cond.false.i.i123:                                ; preds = %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114
+  unreachable
+
+_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit125: ; preds = %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit114
+  br i1 undef, label %Retry.i134, label %if.end.i139
+
+Retry.i134:                                       ; preds = %if.end.i139, %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit125
+  br i1 undef, label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140, label %new.notnull.i136
+
+new.notnull.i136:                                 ; preds = %Retry.i134
+  br label %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140
+
+if.end.i139:                                      ; preds = %_ZNK5clang6driver3Arg8getValueERKNS0_7ArgListEj.exit125
+  br label %Retry.i134
+
+_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140: ; preds = %new.notnull.i136, %Retry.i134
+  br label %for.cond.i.preheader
+
+for.cond.i.preheader:                             ; preds = %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit140, %_ZN4llvm15SmallVectorImplIPKcE9push_backERKS2_.exit
+  br i1 undef, label %for.cond.i.preheader.split.us, label %for.cond.i.preheader.for.cond.i.preheader.split_crit_edge
+
+for.cond.i.preheader.for.cond.i.preheader.split_crit_edge: ; preds = %for.cond.i.preheader
+  br label %for.cond.i.preheader.split
+
+for.cond.i.preheader.split.us:                    ; preds = %for.cond.i.preheader
+  br label %for.cond.i.us
+
+for.cond.i.us:                                    ; preds = %if.end.i.us, %for.cond.i.preheader.split.us
+  br i1 true, label %for.cond.loopexit.us-lcssa.us, label %if.end.i.us ; constant-true condition: always takes the exit edge
+
+if.end.i.us:                                      ; preds = %for.cond.i.us
+  br i1 true, label %for.cond.loopexit.us-lcssa.us, label %for.cond.i.us
+
+for.cond.loopexit.us-lcssa.us:                    ; preds = %if.end.i.us, %for.cond.i.us
+  %tmp178218.us.lcssa = phi %"class.clang::driver::Arg"** [ undef, %if.end.i.us ], [ undef, %for.cond.i.us ] ; the only phi in the function; both incoming values are undef and it has no users
+  br label %for.cond.loopexit
+
+for.cond.i.preheader.split:                       ; preds = %for.cond.i.preheader.for.cond.i.preheader.split_crit_edge
+  br label %for.cond.i
+
+for.cond.i:                                       ; preds = %if.end.i, %for.cond.i.preheader.split
+  br i1 undef, label %for.cond.loopexit.us-lcssa, label %if.end.i
+
+if.end.i:                                         ; preds = %for.cond.i
+  br i1 undef, label %for.cond.loopexit.us-lcssa, label %for.cond.i
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/2017-07-11-incremental-dt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/2017-07-11-incremental-dt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/2017-07-11-incremental-dt.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/2017-07-11-incremental-dt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,56 @@
+; RUN: opt < %s -loop-deletion -S
+; RUN: opt < %s -loop-deletion -analyze -domtree 2>&1 | FileCheck -check-prefix=DT %s
+; RUN: opt < %s -loop-deletion -analyze -verify-dom-info
+
+; CHECK: for.body
+; CHECK-NOT: for.cond1
+
+; Verify only the important parts of the DomTree.
+; DT: [1] %entry
+; DT:   [2] %for.cond
+; DT:     [3] %lbl63A679E5
+; DT:     [3] %for.cond9
+; DT:     [3] %lbl64774A9B
+; DT:     [3] %for.body
+; DT:       [4] %for.cond3.loopexit
+
+define i32 @fn1() { ; CFG-only test: every block ends in a branch and there is no ret, so the declared i32 is never produced
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %entry
+  br i1 undef, label %lbl63A679E5, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.cond1, %for.body
+  br i1 undef, label %for.cond1, label %for.cond3.loopexit ; single-block self-loop; presumably the loop the pass removes, since the expected dominator tree puts %for.cond3.loopexit directly under %for.body
+
+for.cond3.loopexit:                               ; preds = %for.cond1
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %for.cond9, %for.cond3.loopexit
+  br i1 undef, label %for.body4, label %for.cond17
+
+for.body4:                                        ; preds = %for.cond3
+  br label %for.cond5
+
+for.cond5:                                        ; preds = %lbl63A679E5, %for.body4
+  br label %for.cond9
+
+lbl63A679E5:                                      ; preds = %for.cond
+  br label %for.cond5
+
+for.cond9:                                        ; preds = %for.end14.split, %for.cond5
+  br i1 undef, label %for.cond3, label %lbl64774A9B
+
+lbl64774A9B:                                      ; preds = %for.cond17, %for.cond9
+  br label %for.end14.split
+
+for.end14.split:                                  ; preds = %lbl64774A9B
+  br label %for.cond9
+
+for.cond17:                                       ; preds = %for.cond3
+  br label %lbl64774A9B
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/crashbc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/crashbc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/crashbc.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/crashbc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; Make sure we don't crash when writing bitcode.
+; RUN: opt < %s -loop-deletion -o /dev/null
+
+define void @f() { ; a self-loop whose body holds only a debug-value intrinsic call and the loop branch
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %0
+  call void @llvm.dbg.value(metadata i16 undef, metadata !1, metadata !DIExpression()), !dbg !11 ; describes local variable !1 ("i") with an undef value
+  br i1 undef, label %bb1, label %bb3
+
+bb3:                                              ; preds = %bb1
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!9}
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !DILocalVariable(name: "i", scope: !2, file: !3, line: 31, type: !7)
+!2 = distinct !DILexicalBlock(scope: !4, file: !3, line: 31, column: 9)
+!3 = !DIFile(filename: "foo.c", directory: "/bar")
+!4 = distinct !DISubprogram(name: "f", scope: !3, file: !3, line: 26, type: !5, scopeLine: 27, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !9, retainedNodes: !10)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7, !8, !7}
+!7 = !DIBasicType(name: "int", size: 16, encoding: DW_ATE_signed)
+!8 = !DIBasicType(size: 16, encoding: DW_ATE_signed)
+!9 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "My Compiler", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !10, retainedTypes: !10, globals: !10)
+!10 = !{}
+!11 = !DILocation(line: 31, column: 13, scope: !2)

Added: llvm/trunk/test/Transforms/LoopDeletion/dcetest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/dcetest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/dcetest.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/dcetest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; This is the test case taken from Appel's book that illustrates a hard case
+; that SCCP gets right, and when followed by ADCE, is completely eliminated
+;
+; RUN: opt < %s -sccp -simplifycfg -indvars -loop-deletion -dce -simplifycfg -S | not grep br
+
+define i32 @"test function"(i32 %i0, i32 %j0) {
+BB1:
+        br label %BB2
+
+BB2:            ; preds = %BB7, %BB1
+        %j2 = phi i32 [ %j4, %BB7 ], [ 1, %BB1 ]                ; <i32> [#uses=2]
+        %k2 = phi i32 [ %k4, %BB7 ], [ 0, %BB1 ]                ; <i32> [#uses=4]
+        %kcond = icmp slt i32 %k2, 100          ; <i1> [#uses=1]
+        br i1 %kcond, label %BB3, label %BB4
+
+BB3:            ; preds = %BB2
+        %jcond = icmp slt i32 %j2, 20           ; <i1> [#uses=1]
+        br i1 %jcond, label %BB5, label %BB6
+
+BB4:            ; preds = %BB2
+        ret i32 %j2
+
+BB5:            ; preds = %BB3
+        %k3 = add i32 %k2, 1            ; <i32> [#uses=1]
+        br label %BB7
+
+BB6:            ; preds = %BB3
+        %k5 = add i32 %k2, 1            ; <i32> [#uses=1]
+        br label %BB7
+
+BB7:            ; preds = %BB6, %BB5
+        %j4 = phi i32 [ 1, %BB5 ], [ %k2, %BB6 ]                ; <i32> [#uses=1]
+        %k4 = phi i32 [ %k3, %BB5 ], [ %k5, %BB6 ]              ; <i32> [#uses=1]
+        br label %BB2
+}
+

Added: llvm/trunk/test/Transforms/LoopDeletion/diundef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/diundef.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/diundef.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/diundef.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,75 @@
+; RUN: opt %s -loop-deletion -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+@a = common local_unnamed_addr global i32 0, align 4, !dbg !0
+
+define i32 @b() local_unnamed_addr !dbg !12 {
+entry:
+  call void @llvm.dbg.value(metadata i32 0, metadata !16, metadata !DIExpression()), !dbg !17
+  br label %for.cond, !dbg !18
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.cond ], !dbg !20
+  call void @llvm.dbg.value(metadata i32 %i.0, metadata !16, metadata !DIExpression()), !dbg !17
+  %inc = add nuw nsw i32 %i.0, 1, !dbg !21
+  call void @llvm.dbg.value(metadata i32 %inc, metadata !16, metadata !DIExpression()), !dbg !17
+  %exitcond = icmp ne i32 %inc, 3, !dbg !23
+  br i1 %exitcond, label %for.cond, label %for.end, !dbg !24, !llvm.loop !25
+
+; CHECK: call void @llvm.dbg.value(metadata i32 undef, metadata !16, metadata !DIExpression()), !dbg !17
+; CHECK-NEXT: %call = tail call i32 {{.*}} @patatino()
+for.end:                                          ; preds = %for.cond
+  %call = tail call i32 (...) @patatino() #3, !dbg !27
+  %0 = load i32, i32* @a, align 4, !dbg !28
+  ret i32 %0, !dbg !33
+}
+
+declare i32 @patatino(...) local_unnamed_addr
+
+define i32 @main() local_unnamed_addr !dbg !34 {
+entry:
+  %call = call i32 @b(), !dbg !35
+  ret i32 0, !dbg !36
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9, !10}
+!llvm.ident = !{!11}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 8.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: GNU)
+!3 = !DIFile(filename: "a.c", directory: "/Users/davide/work/llvm-project-20170507/build-debug/bin")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{i32 7, !"PIC Level", i32 2}
+!11 = !{!"clang version 8.0.0 "}
+!12 = distinct !DISubprogram(name: "b", scope: !3, file: !3, line: 2, type: !13, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !15)
+!13 = !DISubroutineType(types: !14)
+!14 = !{!6}
+!15 = !{!16}
+!16 = !DILocalVariable(name: "i", scope: !12, file: !3, line: 3, type: !6)
+!17 = !DILocation(line: 3, column: 7, scope: !12)
+!18 = !DILocation(line: 4, column: 8, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !12, file: !3, line: 4, column: 3)
+!20 = !DILocation(line: 0, scope: !19)
+!21 = !DILocation(line: 4, column: 23, scope: !22)
+!22 = distinct !DILexicalBlock(scope: !19, file: !3, line: 4, column: 3)
+!23 = !DILocation(line: 4, column: 17, scope: !22)
+!24 = !DILocation(line: 4, column: 3, scope: !19)
+!25 = distinct !{!25, !24, !26}
+!26 = !DILocation(line: 5, column: 5, scope: !19)
+!27 = !DILocation(line: 6, column: 3, scope: !12)
+!28 = !DILocation(line: 7, column: 10, scope: !12)
+!33 = !DILocation(line: 7, column: 3, scope: !12)
+!34 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 9, type: !13, scopeLine: 9, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !4)
+!35 = !DILocation(line: 9, column: 14, scope: !34)
+!36 = !DILocation(line: 9, column: 19, scope: !34)

Added: llvm/trunk/test/Transforms/LoopDeletion/invalidation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/invalidation.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/invalidation.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/invalidation.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,42 @@
+; Ensure we don't run analyses over loops after they've been deleted. We run
+; one version with a no-op loop pass to make sure that the loop doesn't get
+; simplified away.
+;
+; RUN: opt < %s -passes='require<ivusers>,no-op-loop,require<ivusers>' -S \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,BEFORE
+; RUN: opt < %s -passes='require<ivusers>,loop-deletion,require<ivusers>' -S \
+; RUN:     | FileCheck %s --check-prefixes=CHECK,AFTER
+
+
+define void @foo(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: @foo(
+
+entry:
+  br label %bb
+; CHECK:       entry:
+; BEFORE-NEXT:   br label %bb
+; AFTER-NEXT:    br label %return
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb2 ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  br i1 %t1, label %bb2, label %return
+; BEFORE:      bb:
+; BEFORE:        br i1 {{.*}}, label %bb2, label %return
+; AFTER-NOT:   bb:
+; AFTER-NOT:     br
+
+bb2:
+  %t2 = icmp slt i64 %x.0, %m
+  br i1 %t1, label %bb, label %return
+; BEFORE:      bb2:
+; BEFORE:        br i1 {{.*}}, label %bb, label %return
+; AFTER-NOT:   bb2:
+; AFTER-NOT:     br
+
+return:
+  ret void
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/multiple-exit-conditions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/multiple-exit-conditions.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/multiple-exit-conditions.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/multiple-exit-conditions.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt < %s -loop-deletion -S | FileCheck %s
+; RUN: opt < %s -passes='loop(loop-deletion)' -S | FileCheck %s
+
+; ScalarEvolution can prove the loop iteration is finite, even though
+; it can't represent the exact trip count as an expression. That's
+; good enough to let the loop be deleted.
+
+; CHECK:      entry:
+; CHECK-NEXT:   br label %return
+
+; CHECK:      return:
+; CHECK-NEXT:   ret void
+
+define void @foo(i64 %n, i64 %m) nounwind {
+entry:
+  br label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 %t4, label %bb, label %return
+
+return:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/multiple-exits.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/multiple-exits.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/multiple-exits.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/multiple-exits.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,138 @@
+; Checks whether dead loops with multiple exits can be eliminated.
+; Note that we loop simplify and LCSSA over the test cases to make sure the
+; critical components remain after those passes and are visible to the loop
+; deletion pass.
+;
+; RUN: opt < %s -loop-simplify -lcssa -S | FileCheck %s --check-prefixes=CHECK,BEFORE
+; RUN: opt < %s -loop-deletion -S | FileCheck %s --check-prefixes=CHECK,AFTER
+;
+; RUN: opt < %s -passes=no-op-loop -S | FileCheck %s --check-prefixes=CHECK,BEFORE
+; RUN: opt < %s -passes=loop-deletion -S | FileCheck %s --check-prefixes=CHECK,AFTER
+
+
+define void @foo(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: @foo(
+
+entry:
+  br label %bb
+; CHECK:       entry:
+; BEFORE-NEXT:   br label %bb
+; AFTER-NEXT:    br label %return
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb2 ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  br i1 %t1, label %bb2, label %return
+; BEFORE:      bb:
+; BEFORE:        br i1 {{.*}}, label %bb2, label %return
+; AFTER-NOT:   bb:
+; AFTER-NOT:     br
+
+bb2:
+  %t2 = icmp slt i64 %x.0, %m
+  br i1 %t1, label %bb, label %return
+; BEFORE:      bb2:
+; BEFORE:        br i1 {{.*}}, label %bb, label %return
+; AFTER-NOT:   bb2:
+; AFTER-NOT:     br
+
+return:
+  ret void
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+}
+
+define i64 @bar(i64 %n, i64 %m, i64 %maybe_zero) nounwind {
+; CHECK-LABEL: @bar(
+
+entry:
+  br label %bb
+; CHECK:       entry:
+; BEFORE-NEXT:   br label %bb
+; AFTER-NEXT:    br label %return
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb3 ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  br i1 %t1, label %bb2, label %return
+; BEFORE:      bb:
+; BEFORE:        br i1 {{.*}}, label %bb2, label %return
+; AFTER-NOT:   bb:
+; AFTER-NOT:     br
+
+bb2:
+  %t2 = icmp slt i64 %x.0, %m
+  ; This unused division prevents unifying this loop exit path with others
+  ; because it can be deleted but cannot be hoisted.
+  %unused1 = udiv i64 42, %maybe_zero
+  br i1 %t2, label %bb3, label %return
+; BEFORE:      bb2:
+; BEFORE:        br i1 {{.*}}, label %bb3, label %return
+; AFTER-NOT:   bb2:
+; AFTER-NOT:     br
+
+bb3:
+  %t3 = icmp slt i64 %x.0, %m
+  ; This unused division prevents unifying this loop exit path with others
+  ; because it can be deleted but cannot be hoisted.
+  %unused2 = sdiv i64 42, %maybe_zero
+  br i1 %t3, label %bb, label %return
+; BEFORE:      bb3:
+; BEFORE:        br i1 {{.*}}, label %bb, label %return
+; AFTER-NOT:   bb3:
+; AFTER-NOT:     br
+
+return:
+  %x.lcssa = phi i64 [ 10, %bb ], [ 10, %bb2 ], [ 10, %bb3 ]
+  ret i64 %x.lcssa
+; CHECK:       return:
+; BEFORE-NEXT:   %[[X:.*]] = phi i64 [ 10, %bb ], [ 10, %bb2 ], [ 10, %bb3 ]
+; AFTER-NEXT:    %[[X:.*]] = phi i64 [ 10, %entry ]
+; CHECK-NEXT:    ret i64 %[[X]]
+}
+
+; This function has a loop which looks like @bar's but that cannot be deleted
+; because which path we exit through determines which value is selected.
+define i64 @baz(i64 %n, i64 %m, i64 %maybe_zero) nounwind {
+; CHECK-LABEL:  @baz(
+
+entry:
+  br label %bb
+; CHECK:       entry:
+; CHECK-NEXT:    br label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb3 ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  br i1 %t1, label %bb2, label %return
+; CHECK:       bb:
+; CHECK:         br i1 {{.*}}, label %bb2, label %return
+
+bb2:
+  %t2 = icmp slt i64 %x.0, %m
+  ; This unused division prevents unifying this loop exit path with others
+  ; because it can be deleted but cannot be hoisted.
+  %unused1 = udiv i64 42, %maybe_zero
+  br i1 %t2, label %bb3, label %return
+; CHECK:       bb2:
+; CHECK:         br i1 {{.*}}, label %bb3, label %return
+
+bb3:
+  %t3 = icmp slt i64 %x.0, %m
+  ; This unused division prevents unifying this loop exit path with others
+  ; because it can be deleted but cannot be hoisted.
+  %unused2 = sdiv i64 42, %maybe_zero
+  br i1 %t3, label %bb, label %return
+; CHECK:       bb3:
+; CHECK:         br i1 {{.*}}, label %bb, label %return
+
+return:
+  %x.lcssa = phi i64 [ 12, %bb ], [ 10, %bb2 ], [ 10, %bb3 ]
+  ret i64 %x.lcssa
+; CHECK: return:
+; CHECK-NEXT:  %[[X:.*]] = phi i64 [ 12, %bb ], [ 10, %bb2 ], [ 10, %bb3 ]
+; CHECK-NEXT:  ret i64 %[[X]]
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/simplify-then-delete.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/simplify-then-delete.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/simplify-then-delete.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/simplify-then-delete.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,67 @@
+; RUN: opt < %s -S -indvars -loop-deletion -simplifycfg | FileCheck %s
+; PR5794
+
+; Indvars and loop deletion should be able to eliminate all looping
+; in this testcase.
+
+; CHECK:      define i32 @pmat(i32 %m, i32 %n, double* %y) #0 {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   ret i32 0
+; CHECK-NEXT: }
+
+target datalayout = "e-p:64:64:64"
+
+define i32 @pmat(i32 %m, i32 %n, double* %y) nounwind {
+entry:
+  %cmp4 = icmp sgt i32 %m, 0
+  br i1 %cmp4, label %bb.n10, label %w.e12
+
+w.c:
+  %cmp = icmp slt i32 %inc11, %m
+  br i1 %cmp, label %w.c2.p, label %w.c.w.e12c
+
+w.c.w.e12c:
+  br label %w.c.w.e12c.s
+
+w.c.w.e12c.s:
+  br label %w.e12
+
+bb.n10:
+  %cmp51 = icmp sgt i32 %n, 0
+  br i1 %cmp51, label %bb.n10.w.c.w.e12c.sc, label %bb.n10.bb.n10.sc
+
+bb.n10.bb.n10.sc:
+  br label %bb.n10.s
+
+bb.n10.w.c.w.e12c.sc:
+  br label %w.c.w.e12c.s
+
+bb.n10.s:
+  br label %w.c2.p
+
+w.c2.p:
+  %i.05 = phi i32 [ 0, %bb.n10.s ], [ %inc11, %w.c ]
+  br i1 false, label %bb.n, label %w.e
+
+w.c2:
+  br i1 undef, label %w.b6, label %w.c2.w.ec
+
+w.c2.w.ec:
+  br label %w.e
+
+bb.n:
+  br label %w.b6
+
+w.b6:
+  br label %w.c2
+
+w.e:
+  %i.08 = phi i32 [ undef, %w.c2.w.ec ], [ %i.05, %w.c2.p ]
+  %inc11 = add nsw i32 %i.08, 1
+  br label %w.c
+
+w.e12:
+  ret i32 0
+}
+
+; CHECK: attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/LoopDeletion/unreachable-loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/unreachable-loops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/unreachable-loops.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/unreachable-loops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,412 @@
+; RUN: opt < %s -loop-deletion -verify-dom-info -S | FileCheck %s
+
+; Checking that we can delete loops that are never executed.
+; We do not change the constant conditional branch statement (where the not-taken target
+; is the loop) to an unconditional one.
+
+; delete the infinite loop because it is never executed.
+define void @test1(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: test1
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br i1 true, label %return, label %bb.preheader
+; CHECK-NOT: bb:
+entry:
+  br i1 true, label %return, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 true, label %bb, label %return
+
+return:
+  ret void
+}
+
+; FIXME: We can delete this infinite loop. Currently we do not,
+; because the infinite loop has no exit block.
+define void @test2(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: test2
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br i1 true, label %return, label %bb.preheader
+; CHECK-LABEL: bb:
+; CHECK: br label %bb
+entry:
+  br i1 true, label %return, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br label %bb
+
+return:
+  ret void
+}
+
+; There are multiple exiting blocks and a single exit block. 
+; Since it is a never executed loop, we do not care about the values
+; from different exiting paths and we can
+; delete the loop.
+define i64 @test3(i64 %n, i64 %m, i64 %maybe_zero) nounwind {
+
+; CHECK-NOT: bb:
+; CHECK-NOT: bb2:
+; CHECK-NOT: bb3:
+; CHECK-LABEL: return.loopexit:
+; CHECK-NEXT: %x.lcssa.ph = phi i64 [ undef, %bb.preheader ]
+; CHECK-NEXT: br label %return
+; CHECK-LABEL: return:
+; CHECK-NEXT: %x.lcssa = phi i64 [ 20, %entry ], [ %x.lcssa.ph, %return.loopexit ]
+; CHECK-NEXT: ret i64 %x.lcssa
+entry:
+  br i1 false, label %bb, label %return
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ %t0, %bb3 ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  br i1 %t1, label %bb2, label %return
+
+bb2:
+  %t2 = icmp slt i64 %x.0, %m
+  %unused1 = udiv i64 42, %maybe_zero
+  br i1 %t2, label %bb3, label %return
+
+bb3:
+  %t3 = icmp slt i64 %x.0, %m
+  %unused2 = sdiv i64 42, %maybe_zero
+  br i1 %t3, label %bb, label %return
+
+return:
+; the only valid value for x.lcssa is 20.
+  %x.lcssa = phi i64 [ 12, %bb ], [ 14, %bb2 ], [ 16, %bb3 ], [20, %entry ]
+  ret i64 %x.lcssa
+}
+
+; Cannot delete the loop, since it may be executed at runtime.
+define void @test4(i64 %n, i64 %m, i1 %cond) {
+; CHECK-LABEL: test4
+; CHECK-LABEL: bb:
+entry:
+  br i1 %cond, label %looppred1, label %looppred2
+
+looppred1:
+  br i1 true, label %return, label %bb
+
+looppred2:
+  br i1 false, label %return, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %looppred1 ], [ 1, %looppred2 ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 true, label %bb, label %return
+
+return:
+  ret void
+}
+
+; multiple constant conditional branches with loop not-taken in all cases.
+define void @test5(i64 %n, i64 %m, i1 %cond) nounwind {
+; CHECK-LABEL: test5
+; CHECK-LABEL: looppred1:
+; CHECK-NEXT: br i1 true, label %return, label %bb.preheader
+; CHECK-LABEL: looppred2:
+; CHECK-NEXT: br i1 true, label %return, label %bb.preheader
+; CHECK-NOT: bb:
+entry:
+  br i1 %cond, label %looppred1, label %looppred2
+
+looppred1:
+  br i1 true, label %return, label %bb
+
+looppred2:
+  br i1 true, label %return, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %looppred1 ], [ 1, %looppred2 ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 true, label %bb, label %return
+
+return:
+  ret void
+}
+
+; Don't delete this infinite loop because the loop 
+; is executable at runtime.
+define void @test6(i64 %n, i64 %m) nounwind {
+; CHECK-LABEL: test6
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br i1 true, label %bb.preheader, label %bb.preheader
+; CHECK: bb:
+entry:
+  br i1 true, label %bb, label %bb
+
+bb:
+  %x.0 = phi i64 [ 0, %entry ], [ 0, %entry ], [ %t0, %bb ]
+  %t0 = add i64 %x.0, 1
+  %t1 = icmp slt i64 %x.0, %n
+  %t3 = icmp sgt i64 %x.0, %m
+  %t4 = and i1 %t1, %t3
+  br i1 true, label %bb, label %return
+
+return:
+  ret void
+}
+
+declare i64 @foo(i64)
+; The loop L2 is never executed and is a subloop, with an 
+; exit block that branches back to parent loop.
+; Here we can delete loop L2, while L1 still exists.
+define i64 @test7(i64 %n) {
+; CHECK-LABEL: test7
+; CHECK-LABEL: L1:
+; CHECK: br i1 true, label %L1Latch, label %L2.preheader
+; CHECK-LABEL: L2.preheader:
+; CHECK-NEXT: br label %L1Latch.loopexit
+; CHECK-LABEL: L1Latch.loopexit:
+; CHECK: br label %L1Latch
+; CHECK-LABEL: L1Latch:
+; CHECK-NEXT: %y = phi i64 [ %y.next, %L1 ], [ %y.L2.lcssa, %L1Latch.loopexit ]
+; CHECK: br i1 %cond2, label %exit, label %L1
+entry: 
+  br label %L1
+
+L1:
+  %y.next = phi i64 [ 0, %entry ], [ %y.add, %L1Latch ]
+  br i1 true, label %L1Latch, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L2 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L2, label %L1Latch
+
+L1Latch:
+ %y = phi i64 [ %y.next, %L1 ], [ %y.L2, %L2 ]
+ %y.add = add i64 %y, %n
+ %cond2 = icmp eq i64 %y.add, 42
+ br i1 %cond2, label %exit, label %L1
+
+exit:
+ ret i64 %y.add
+}
+
+
+; Show recursive deletion of loops. Since we start with subloops and progress outward 
+; to parent loop, we first delete the loop L2. Now loop L1 becomes a non-loop since its backedge
+; from L2's preheader to L1's exit block is never taken. So, L1 gets deleted as well.
+define void @test8(i64 %n) {
+; CHECK-LABEL: test8
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br label %exit
+; CHECK-LABEL: exit:
+; CHECK-NEXT: ret void
+entry: 
+  br label %L1
+
+L1:
+  br i1 true, label %exit, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L2 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L2, label %L1
+
+exit:
+ ret void
+}
+
+
+; Delete a loop (L2) which has subloop (L3).
+; Here we delete loop L2, but leave L3 as is.
+; FIXME: Can delete L3 as well, by iteratively going backward through the single
+; predecessor of L3 until we reach L1's block that guarantees L3 is never
+; executed.
+define void @test9(i64 %n) {
+; CHECK-LABEL: test9
+; CHECK-LABEL: L2.preheader:
+; CHECK-NEXT: br label %L3.preheader
+; CHECK-NOT: L2:
+; CHECK-LABEL: L3.preheader:
+; CHECK-NEXT: %y.L2.lcssa = phi i64 [ undef, %L2.preheader ]
+; CHECK-NEXT: br label %L3
+; CHECK-LABEL: L3:
+; CHECK: br i1 %cond2, label %L3, label %L1.loopexit
+entry: 
+  br label %L1
+
+L1:
+  br i1 true, label %exit, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L2 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L2, label %L3
+
+L3: 
+  %cond2 = icmp slt i64 %y.L2, %n
+  br i1 %cond2, label %L3, label %L1
+
+exit:
+ ret void
+}
+
+; We cannot delete L3 because of call within it.
+; Since L3 is not deleted, and entirely contained within L2, L2 is also not
+; deleted.
+; FIXME: We can delete unexecutable loops having
+; subloops contained entirely within them.
+define void @test10(i64 %n) {
+; CHECK-LABEL: test10
+; CHECK: L2:
+; CHECK: L3:
+entry: 
+  br label %L1
+
+L1:
+  br i1 true, label %exit, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L3 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L1, label %L3
+
+L3:
+  %y.L3 = phi i64 [ %y.L2, %L2 ], [ %y.L3.next, %L3 ]
+  %y.L3.next = add i64 %y.L3, 1
+  %dummy = call i64 @foo(i64 %y.L3.next)
+  %cond2 = icmp slt i64 %y.L3, %n
+  br i1 %cond2, label %L3, label %L2
+
+exit:
+ ret void
+}
+
+; same as test10, but L3 does not contain call.
+; So, in the first iteration, all statements of L3 are made invariant, and L3 is
+; deleted.
+; In the next iteration, since L2 is never executed and has no subloops, we delete
+; L2 as well. Finally, the outermost loop L1 is deleted.
+define void @test11(i64 %n) {
+; CHECK-LABEL: test11
+; CHECK-LABEL: entry:
+; CHECK-NEXT: br label %exit
+; CHECK-LABEL: exit:
+; CHECK-NEXT: ret void
+entry: 
+  br label %L1
+
+L1:
+  br i1 true, label %exit, label %L2
+
+L2:
+  %x = phi i64 [ 0, %L1 ], [ %x.next, %L3 ]
+  %x.next = add i64 %x, 1
+  %y.L2 = call i64 @foo(i64 %x.next)
+  %cond = icmp slt i64 %x.next, %n
+  br i1 %cond, label %L1, label %L3
+
+L3: 
+  %y.L3 = phi i64 [ %y.L2, %L2 ], [ %y.L3.next, %L3 ]
+  %y.L3.next = add i64 %y.L3, 1
+  %cond2 = icmp slt i64 %y.L3, %n
+  br i1 %cond2, label %L3, label %L2
+
+exit:
+ ret void
+}
+
+
+; 2 edges from a single exiting block to the exit block.
+define i64 @test12(i64 %n){
+;CHECK-LABEL: @test12
+; CHECK-NOT: L1:
+; CHECK-NOT: L1Latch:
+; CHECK-LABEL: L1.preheader:
+; CHECK-NEXT:    br label %exit
+; CHECK-LABEL: exit:
+; CHECK-NEXT:    %y.phi = phi i64 [ undef, %L1.preheader ]
+; CHECK-NEXT:    ret i64 %y.phi
+
+entry:
+  br i1 true, label %exit1, label %L1
+
+exit1:
+  ret i64 42
+
+L1:                                               ; preds = %L1Latch, %entry
+  %y.next = phi i64 [ 0, %entry ], [ %y.add, %L1Latch ]
+  br i1 true, label %L1Latch, label %exit
+
+L1Latch:                                          ; preds = %L1
+  %y = phi i64 [ %y.next, %L1 ]
+  %y.add = add i64 %y, %n
+  %cond2 = icmp eq i64 %y.add, 42
+  switch i64 %n, label %L1 [
+    i64 10, label %exit
+    i64 20, label %exit
+  ]
+
+exit:                                             ; preds = %L1Latch, %L1Latch
+  %y.phi = phi i64 [ 10, %L1Latch ], [ 10, %L1Latch ], [ %y.next, %L1]
+  ret i64 %y.phi
+}
+
+; multiple edges to exit block from the same exiting blocks
+define i64 @test13(i64 %n) {
+; CHECK-LABEL: @test13
+; CHECK-NOT: L1:
+; CHECK-NOT: L1Latch:
+; CHECK-LABEL: L1.preheader:
+; CHECK-NEXT:    br label %exit
+; CHECK-LABEL: exit:
+; CHECK-NEXT:    %y.phi = phi i64 [ undef, %L1.preheader ]
+; CHECK-NEXT:    ret i64 %y.phi
+
+entry:
+  br i1 true, label %exit1, label %L1
+
+exit1:
+  ret i64 42
+
+L1:                                               ; preds = %L1Latch, %entry
+  %y.next = phi i64 [ 0, %entry ], [ %y.add, %L1Latch ]
+  br i1 true, label %L1Block, label %exit
+
+L1Block:                                          ; preds = %L1
+  %y = phi i64 [ %y.next, %L1 ]
+  %y.add = add i64 %y, %n
+  %cond2 = icmp eq i64 %y.add, 42
+  switch i64 %n, label %L1Latch [
+    i64 10, label %exit
+    i64 20, label %exit
+  ]
+
+L1Latch:
+  switch i64 %n, label %L1 [
+    i64 30, label %exit
+    i64 40, label %exit
+  ]
+
+exit:                                             ; preds = %L1Block, %L1, %L1Latch
+  %y.phi = phi i64 [ 10, %L1Block ], [ 10, %L1Block ], [ %y.next, %L1 ], [ 30, %L1Latch ], [ 30, %L1Latch ]
+  ret i64 %y.phi
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/update-scev.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/update-scev.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/update-scev.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/update-scev.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,56 @@
+; RUN: opt -S -analyze -scalar-evolution -loop-deletion -scalar-evolution < %s | FileCheck %s --check-prefix=SCEV-EXPRS
+; RUN: opt -S -loop-deletion < %s | FileCheck %s --check-prefix=IR-AFTER-TRANSFORM
+; RUN: opt -S -indvars -loop-deletion -indvars < %s | FileCheck %s --check-prefix=ORIGINAL-CRASH
+
+; Checking for a crash.  Loop-deletion would change the loop
+; disposition of an instruction, but not update SCEV.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define void @pr27570() {
+; IR-AFTER-TRANSFORM-LABEL: @pr27570(
+; ORIGINAL-CRASH: @pr27570(
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond14, %entry
+  %f.0 = phi i32 [ 20, %entry ], [ 0, %for.cond14 ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc11, %for.cond
+; IR-AFTER-TRANSFORM: for.body:
+; IR-AFTER-TRANSFORM: %cmp = icmp eq i32 %val, -1
+; IR-AFTER-TRANSFORM: %conv7 = zext i1 %cmp to i32
+; IR-AFTER-TRANSFORM: for.body6:
+
+; SCEV-EXPRS:  %conv7 = zext i1 %cmp to i32
+; SCEV-EXPRS:  %conv7 = zext i1 %cmp to i32
+; SCEV-EXPRS-NEXT:  -->  {{.*}} LoopDispositions: { %for.body: Variant, %for.cond: Variant, %for.body6: Invariant }
+  %val = phi i32 [ -20, %for.cond ], [ %inc12, %for.inc11 ]
+  %g.040 = phi i32 [ -20, %for.cond ], [ %and.lcssa, %for.inc11 ]
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.body
+  %h.039 = phi i32 [ 1, %for.body ], [ %inc, %for.body6 ]
+  %g.138 = phi i32 [ %g.040, %for.body ], [ %and, %for.body6 ]
+  %cmp = icmp eq i32 %val, -1
+  %conv7 = zext i1 %cmp to i32
+  %add.i = add nsw i32 %conv7, %h.039
+  %sext = shl i32 %add.i, 24
+  %conv8 = ashr exact i32 %sext, 24
+  %cmp9 = icmp eq i32 %conv8, %f.0
+  %conv10 = zext i1 %cmp9 to i32
+  %and = and i32 %conv10, %g.138
+  %inc = add i32 %h.039, 1
+  br i1 undef, label %for.inc11, label %for.body6
+
+for.inc11:                                        ; preds = %for.body6
+  %and.lcssa = phi i32 [ %and, %for.body6 ]
+  %inc12 = add nsw i32 %val, 1
+  %tobool = icmp eq i32 %inc12, 0
+  br i1 %tobool, label %for.cond14, label %for.body
+
+for.cond14:                                       ; preds = %for.cond14, %for.inc11
+  br i1 undef, label %for.cond, label %for.cond14
+}

Added: llvm/trunk/test/Transforms/LoopDeletion/use-in-unreachable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDeletion/use-in-unreachable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDeletion/use-in-unreachable.ll (added)
+++ llvm/trunk/test/Transforms/LoopDeletion/use-in-unreachable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt < %s -loop-deletion -S | FileCheck %s
+
+; Checking that possible users of instruction from the loop in
+; unreachable blocks are handled.
+
+define i64 @foo() {
+entry:
+  br label %invloop
+; CHECK-NOT: invloop
+invloop:
+  %indvar1 = phi i64 [ 3, %entry ], [ %indvar2, %invloop_iter ]
+  %check = icmp ult i64 %indvar1, 400
+  br i1 %check, label %invloop_iter, label %loopexit
+invloop_iter:
+  %indvar2 = add i64 %indvar1, 1
+  %baddef = add i64 0, 0
+  br label %invloop
+loopexit:
+  ret i64 0
+deadcode:
+; CHECK-LABEL: deadcode
+; CHECK: ret i64 undef
+  ret i64 %baddef
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/basic-with-memchecks.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,110 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
+; RUN:   < %s | FileCheck %s
+
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -loop-vectorize -force-vector-width=4 \
+; RUN:   -verify-loop-info -verify-dom-info -S < %s | \
+; RUN:   FileCheck --check-prefix=VECTORIZE %s
+
+; The memcheck version of basic.ll.  We should distribute and vectorize the
+; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
+;
+;   for (i = 0; i < n; i++) {
+;     A[i + 1] = A[i] * B[i];
+; -------------------------------
+;     C[i] = D[i] * E[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+@B = common global i32* null, align 8
+@A = common global i32* null, align 8
+@C = common global i32* null, align 8
+@D = common global i32* null, align 8
+@E = common global i32* null, align 8
+
+define void @f() {
+entry:                                            ; load the five array base pointers from the globals
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8
+  %e = load i32*, i32** @E, align 8
+  br label %for.body
+
+; We have two compares for each array overlap check.
+; Since the checks to A and A + 4 get merged, this will give us a
+; total of 8 compares.
+;
+; CHECK: for.body.lver.check:
+; CHECK:     = icmp
+; CHECK:     = icmp
+
+; CHECK:     = icmp
+; CHECK:     = icmp
+
+; CHECK:     = icmp
+; CHECK:     = icmp
+
+; CHECK:     = icmp
+; CHECK:     = icmp
+
+; CHECK-NOT: = icmp
+; CHECK:     br i1 %memcheck.conflict, label %for.body.ph.lver.orig, label %for.body.ph.ldist1
+
+; The non-distributed loop that the memchecks fall back on.
+
+; CHECK: for.body.ph.lver.orig:
+; CHECK:     br label %for.body.lver.orig
+; CHECK: for.body.lver.orig:
+; CHECK:    br i1 %exitcond.lver.orig, label %for.end, label %for.body.lver.orig
+
+; Verify the two distributed loops.
+
+; CHECK: for.body.ph.ldist1:
+; CHECK:     br label %for.body.ldist1
+; CHECK: for.body.ldist1:
+; CHECK:    %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
+; CHECK:    br i1 %exitcond.ldist1, label %for.body.ph, label %for.body.ldist1
+
+; CHECK: for.body.ph:
+; CHECK:    br label %for.body
+; CHECK: for.body:
+; CHECK:    %mulC = mul i32 %loadD, %loadE
+; CHECK: for.end:
+
+
+; VECTORIZE: mul <4 x i32>
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  ; First statement: A[ind + 1] = A[ind] * B[ind].
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+  ; Second statement: C[ind] = D[ind] * E[ind].
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+  ; Leave the loop once the incremented index reaches 20.
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/basic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/basic.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/basic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,83 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
+; RUN:   < %s | FileCheck %s
+
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info \
+; RUN:   -loop-accesses -analyze < %s | FileCheck %s --check-prefix=ANALYSIS
+
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -loop-vectorize -force-vector-width=4 -S \
+; RUN:   < %s | FileCheck %s --check-prefix=VECTORIZE
+
+; We should distribute this loop into a safe (2nd statement) and unsafe loop
+; (1st statement):
+;   for (i = 0; i < n; i++) {
+;     A[i + 1] = A[i] * B[i];
+;     =======================
+;     C[i] = D[i] * E[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define void @f(i32* noalias %a,
+               i32* noalias %b,
+               i32* noalias %c,
+               i32* noalias %d,
+               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; Verify the two distributed loops.
+
+; CHECK: entry.split.ldist1:
+; CHECK:    br label %for.body.ldist1
+; CHECK: for.body.ldist1:
+; CHECK:    %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
+; CHECK:    br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1
+
+; CHECK: entry.split:
+; CHECK:    br label %for.body
+; CHECK: for.body:
+; CHECK:    %mulC = mul i32 %loadD, %loadE
+; CHECK: for.end:
+
+
+; ANALYSIS: for.body:
+; ANALYSIS-NEXT: Memory dependences are safe{{$}}
+; ANALYSIS: for.body.ldist1:
+; ANALYSIS-NEXT: Report: unsafe dependent memory operations in loop
+
+
+; VECTORIZE: mul <4 x i32>
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  ; First statement: A[ind + 1] = A[ind] * B[ind].
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+  ; Second statement: C[ind] = D[ind] * E[ind].
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+  ; Leave the loop once the incremented index reaches 20.
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/bounds-expansion-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/bounds-expansion-bug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/bounds-expansion-bug.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/bounds-expansion-bug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,106 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -S < %s | FileCheck %s
+
+; When emitting the memchecks for:
+;
+;   for (i = 0; i < n; i++) {
+;     A[i + 1] = A[i] * B[i];
+;     =======================
+;     C[i] = D[i] * E[i];
+;   }
+;
+; we had a bug when expanding the bounds for A and C.  These are expanded
+; multiple times and rely on the caching in SCEV expansion to avoid any
+; redundancy.  However, due to logic in SCEVExpander::ReuseOrCreateCast, we
+; can get earlier expanded values invalidated when casts are used.  This test
+; ensures that we are not using the invalidated values.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %a1, i32* %a2,
+               i32* %b,
+               i32* %c1, i32* %c2,
+               i32* %d,
+               i32* %e) {
+entry:
+  ; Branch on %e only to create two predecessors for the PHIs in 'join'.
+  %cond = icmp eq i32* %e, null
+  br i1 %cond, label %one, label %two
+one:
+  br label %join
+two:
+  br label %join
+join:
+
+; The pointers need to be defined by PHIs in order for the bug to trigger.
+; Because of the PHIs the existing casts won't be at the desired location so a
+; new cast will be emitted and the old cast will get invalidated.
+;
+; These are the steps:
+;
+; 1. After the bounds for A and C are first expanded:
+;
+;   join:
+;     %a = phi i32* [ %a1, %one ], [ %a2, %two ]
+;     %c = phi i32* [ %c1, %one ], [ %c2, %two ]
+;     %c5 = bitcast i32* %c to i8*
+;     %a3 = bitcast i32* %a to i8*
+;
+; 2. After A is expanded again:
+;
+;   join:                                             ; preds = %two, %one
+;     %a = phi i32* [ %a1, %one ], [ %a2, %two ]
+;     %c = phi i32* [ %c1, %one ], [ %c2, %two ]
+;     %a3 = bitcast i32* %a to i8*                   <--- new
+;     %c5 = bitcast i32* %c to i8*
+;     %0 = bitcast i32* undef to i8*                 <--- old, invalidated
+;
+; 3. Finally, when C is expanded again:
+;
+;   join:                                             ; preds = %two, %one
+;     %a = phi i32* [ %a1, %one ], [ %a2, %two ]
+;     %c = phi i32* [ %c1, %one ], [ %c2, %two ]
+;     %c5 = bitcast i32* %c to i8*                   <--- new
+;     %a3 = bitcast i32* %a to i8*
+;     %0 = bitcast i32* undef to i8*                 <--- old, invalidated
+;     %1 = bitcast i32* undef to i8*
+
+  %a = phi i32* [%a1, %one], [%a2, %two]
+  %c = phi i32* [%c1, %one], [%c2, %two]
+  br label %for.body
+
+
+; CHECK: [[VALUE:%[0-9a-z]+]] = bitcast i32* undef to i8*
+; CHECK-NOT: [[VALUE]]
+
+for.body:                                         ; preds = %for.body, %join
+  %ind = phi i64 [ 0, %join ], [ %add, %for.body ]
+  ; First statement: A[ind + 1] = A[ind] * B[ind].
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+  ; Second statement: C[ind] = D[ind] * E[ind].
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+  ; Leave the loop once the incremented index reaches 20.
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/crash-in-memcheck-generation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/crash-in-memcheck-generation.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/crash-in-memcheck-generation.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/crash-in-memcheck-generation.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,59 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -loop-vectorize -force-vector-width=4 \
+; RUN:   -verify-loop-info -verify-dom-info -S < %s | FileCheck %s
+
+; If only A and B can alias here, we don't need memchecks to distribute since
+; A and B are in the same partition.  This used to cause a crash in the
+; memcheck generation.
+;
+;   for (i = 0; i < n; i++) {
+;     A[i + 1] = A[i] * B[i];
+; ------------------------------
+;     C[i] = D[i] * E[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define void @f(i32*  %a,
+               i32*  %b,
+               i32* noalias %c,
+               i32* noalias %d,
+               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; CHECK-NOT: memcheck:
+; CHECK: mul <4 x i32>
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  ; First statement: A[ind + 1] = A[ind] * B[ind]; %a and %b are the only non-noalias pointers.
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+  ; Second statement: C[ind] = D[ind] * E[ind].
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+  ; Leave the loop once the incremented index reaches 20.
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,83 @@
+; RUN: opt -loop-simplify -loop-distribute -enable-loop-distribute -S -pass-remarks-missed=loop-distribute \
+; RUN:     -pass-remarks-analysis=loop-distribute \
+; RUN:     -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s --check-prefix=HOTNESS
+; RUN: opt -loop-simplify -loop-distribute -enable-loop-distribute -S -pass-remarks-missed=loop-distribute \
+; RUN:     -pass-remarks-analysis=loop-distribute \
+; RUN:                                < %s 2>&1 | FileCheck %s --check-prefix=NO_HOTNESS
+
+; RUN: opt -passes='loop-simplify,require<aa>,loop-distribute' -S -pass-remarks-missed=loop-distribute \
+; RUN:     -pass-remarks-analysis=loop-distribute \
+; RUN:     -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s --check-prefix=HOTNESS
+; RUN: opt -passes='loop-simplify,require<aa>,loop-distribute' -S -pass-remarks-missed=loop-distribute \
+; RUN:     -pass-remarks-analysis=loop-distribute \
+; RUN:                                < %s 2>&1 | FileCheck %s --check-prefix=NO_HOTNESS
+
+; This is the input program:
+;
+;     1	void forced (char *A, char *B, char *C, int N) {
+;     2	#pragma clang loop distribute(enable)
+;     3	  for(int i = 0; i < N; i++) {
+;     4	    A[i] = B[i] * C[i];
+;     5	  }
+;     6	}
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info (hotness: 300)
+; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization (hotness: 300)
+; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info{{$}}
+; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization{{$}}
+
+define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 !prof !22 {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0, !dbg !9
+  br i1 %cmp12, label %ph, label %for.cond.cleanup, !dbg !10, !prof !23
+; Reached only when N > 0; falls straight into the loop.
+ph:
+  br label %for.body
+; Loop body: A[i] = B[i] * C[i] on i8 elements; the back-edge branch carries !llvm.loop !20 and !prof !24.
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !12
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !12, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !16
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !16, !tbaa !13
+  %mul = mul i8 %1, %0, !dbg !17
+  %arrayidx6 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !18
+  store i8 %mul, i8* %arrayidx6, align 1, !dbg !19, !tbaa !13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !10
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !10, !llvm.loop !20, !prof !24
+; Shared exit for the N <= 0 path and the loop-finished path.
+for.cond.cleanup:
+  ret void, !dbg !11
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 267633) (llvm/trunk 267675)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/t.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = distinct !DISubprogram(name: "forced", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 3, column: 20, scope: !7)
+!10 = !DILocation(line: 3, column: 3, scope: !7)
+!11 = !DILocation(line: 6, column: 1, scope: !7)
+!12 = !DILocation(line: 4, column: 12, scope: !7)
+!13 = !{!14, !14, i64 0}
+!14 = !{!"omnipotent char", !15, i64 0}
+!15 = !{!"Simple C/C++ TBAA"}
+!16 = !DILocation(line: 4, column: 19, scope: !7)
+!17 = !DILocation(line: 4, column: 17, scope: !7)
+!18 = !DILocation(line: 4, column: 5, scope: !7)
+!19 = !DILocation(line: 4, column: 10, scope: !7)
+!20 = distinct !{!20, !21}
+!21 = !{!"llvm.loop.distribute.enable", i1 true}
+!22 = !{!"function_entry_count", i64 3}
+!23 = !{!"branch_weights", i32 99, i32 1}
+!24 = !{!"branch_weights", i32 1, i32 99}

Added: llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/diagnostics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,179 @@
+; RUN: opt -loop-simplify -loop-distribute -enable-loop-distribute -S < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=ALWAYS --check-prefix=NO_REMARKS
+; RUN: opt -loop-simplify -loop-distribute -enable-loop-distribute -S \
+; RUN:     -pass-remarks-missed=loop-distribute < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=ALWAYS --check-prefix=MISSED_REMARKS
+; RUN: opt -loop-simplify -loop-distribute -enable-loop-distribute -S \
+; RUN:     -pass-remarks-analysis=loop-distribute < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=ALWAYS --check-prefix=ANALYSIS_REMARKS
+; RUN: opt -loop-simplify -loop-distribute -enable-loop-distribute -S \
+; RUN:     -pass-remarks=loop-distribute < %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefix=ALWAYS --check-prefix=REMARKS
+
+; This is the input program:
+;
+;     1	void forced (char *A, char *B, char *C, int N) {
+;     2	#pragma clang loop distribute(enable)
+;     3	  for(int i = 0; i < N; i++) {
+;     4	    A[i] = B[i] * C[i];
+;     5	  }
+;     6	}
+;     7
+;     8	void not_forced (char *A, char *B, char *C, int N) {
+;     9	  for(int i = 0; i < N; i++) {
+;    10	    A[i] = B[i] * C[i];
+;    11	  }
+;    12	}
+;    13
+;    14 void success (char *A, char *B, char *C, char *D, char *E, int N) {
+;    15   for(int i = 0; i < N; i++) {
+;    16     A[i + 1] = A[i] + B[i];
+;    17     C[i] = D[i] * E[i];
+;    18   }
+;    19 }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+; MISSED_REMARKS: remark:  /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info
+; ALWAYS:         remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization
+; ALWAYS:         warning: /tmp/t.c:3:3: loop not distributed: failed explicitly specified loop distribution
+
+define void @forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !7 {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0, !dbg !9
+  br i1 %cmp12, label %ph, label %for.cond.cleanup, !dbg !10
+; Reached only when N > 0; falls straight into the loop.
+ph:
+  br label %for.body
+; Loop body: A[i] = B[i] * C[i] on i8 elements; the back-edge branch carries !llvm.loop !20.
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !12
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !12, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !16
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !16, !tbaa !13
+  %mul = mul i8 %1, %0, !dbg !17
+  %arrayidx6 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !18
+  store i8 %mul, i8* %arrayidx6, align 1, !dbg !19, !tbaa !13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !10
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !10, !llvm.loop !20
+; Shared exit for the N <= 0 path and the loop-finished path.
+for.cond.cleanup:
+  ret void, !dbg !11
+}
+
+; NO_REMARKS-NOT: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization
+; MISSED_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info
+; ANALYSIS_REMARKS: remark: /tmp/t.c:9:3: loop not distributed: memory operations are safe for vectorization
+; ALWAYS-NOT: warning: /tmp/t.c:9:3: loop not distributed: failed explicitly specified loop distribution
+
+define void @not_forced(i8* %A, i8* %B, i8* %C, i32 %N) !dbg !22 {
+entry:
+  %cmp12 = icmp sgt i32 %N, 0, !dbg !23
+  br i1 %cmp12, label %ph, label %for.cond.cleanup, !dbg !24
+; Reached only when N > 0; falls straight into the loop.
+ph:
+  br label %for.body
+; Loop body: A[i] = B[i] * C[i] on i8 elements; the back-edge branch has no !llvm.loop metadata.
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !26
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !26, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !27
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !27, !tbaa !13
+  %mul = mul i8 %1, %0, !dbg !28
+  %arrayidx6 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !29
+  store i8 %mul, i8* %arrayidx6, align 1, !dbg !30, !tbaa !13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !24
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !24
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !24
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !24
+; Shared exit for the N <= 0 path and the loop-finished path.
+for.cond.cleanup:
+  ret void, !dbg !25
+}
+
+; REMARKS: remark: /tmp/t.c:15:3: distributed loop
+
+define void @success(i8* %A, i8* %B, i8* %C, i8* %D, i8* %E, i32 %N) !dbg !31 {
+entry:
+  %cmp28 = icmp sgt i32 %N, 0, !dbg !32
+  br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !33
+; Reached only when N > 0; falls straight into the loop.
+ph:
+  br label %for.body
+; Loop body, two statements on i8 elements: A[i + 1] = A[i] + B[i], then C[i] = D[i] * E[i].
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !35
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !35, !tbaa !13
+  %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !36
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !36, !tbaa !13
+  %add = add i8 %1, %0, !dbg !37
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !33
+  %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !38
+  store i8 %add, i8* %arrayidx7, align 1, !dbg !39, !tbaa !13
+  %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !40
+  %2 = load i8, i8* %arrayidx9, align 1, !dbg !40, !tbaa !13
+  %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !41
+  %3 = load i8, i8* %arrayidx12, align 1, !dbg !41, !tbaa !13
+  %mul = mul i8 %3, %2, !dbg !42
+  %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !43
+  store i8 %mul, i8* %arrayidx16, align 1, !dbg !44, !tbaa !13
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !33
+  %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !33
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !33
+; Shared exit for the N <= 0 path and the loop-finished path.
+for.cond.cleanup:
+  ret void, !dbg !34
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 267633) (llvm/trunk 267675)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/t.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = distinct !DISubprogram(name: "forced", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 3, column: 20, scope: !7)
+!10 = !DILocation(line: 3, column: 3, scope: !7)
+!11 = !DILocation(line: 6, column: 1, scope: !7)
+!12 = !DILocation(line: 4, column: 12, scope: !7)
+!13 = !{!14, !14, i64 0}
+!14 = !{!"omnipotent char", !15, i64 0}
+!15 = !{!"Simple C/C++ TBAA"}
+!16 = !DILocation(line: 4, column: 19, scope: !7)
+!17 = !DILocation(line: 4, column: 17, scope: !7)
+!18 = !DILocation(line: 4, column: 5, scope: !7)
+!19 = !DILocation(line: 4, column: 10, scope: !7)
+!20 = distinct !{!20, !21}
+!21 = !{!"llvm.loop.distribute.enable", i1 true}
+!22 = distinct !DISubprogram(name: "not_forced", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!23 = !DILocation(line: 9, column: 20, scope: !22)
+!24 = !DILocation(line: 9, column: 3, scope: !22)
+!25 = !DILocation(line: 12, column: 1, scope: !22)
+!26 = !DILocation(line: 10, column: 12, scope: !22)
+!27 = !DILocation(line: 10, column: 19, scope: !22)
+!28 = !DILocation(line: 10, column: 17, scope: !22)
+!29 = !DILocation(line: 10, column: 5, scope: !22)
+!30 = !DILocation(line: 10, column: 10, scope: !22)
+!31 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 14, type: !8, isLocal: false, isDefinition: true, scopeLine: 14, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!32 = !DILocation(line: 15, column: 20, scope: !31)
+!33 = !DILocation(line: 15, column: 3, scope: !31)
+!34 = !DILocation(line: 19, column: 1, scope: !31)
+!35 = !DILocation(line: 16, column: 16, scope: !31)
+!36 = !DILocation(line: 16, column: 23, scope: !31)
+!37 = !DILocation(line: 16, column: 21, scope: !31)
+!38 = !DILocation(line: 16, column: 5, scope: !31)
+!39 = !DILocation(line: 16, column: 14, scope: !31)
+!40 = !DILocation(line: 17, column: 12, scope: !31)
+!41 = !DILocation(line: 17, column: 19, scope: !31)
+!42 = !DILocation(line: 17, column: 17, scope: !31)
+!43 = !DILocation(line: 17, column: 5, scope: !31)
+!44 = !DILocation(line: 17, column: 10, scope: !31)

Added: llvm/trunk/test/Transforms/LoopDistribute/disable_nonforced.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/disable_nonforced.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/disable_nonforced.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/disable_nonforced.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,50 @@
+; RUN: opt -loop-distribute -enable-loop-distribute=1 -S < %s | FileCheck %s
+;
+; Check that the disable_nonforced is honored by loop distribution.
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced(
+; CHECK-NOT: for.body.ldist1:
+define void @disable_nonforced(i32* noalias %a,
+                         i32* noalias %b,
+                         i32* noalias %c,
+                         i32* noalias %d,
+                         i32* noalias %e) {
+entry:
+  br label %for.body
+
+for.body:
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  ; First statement: A[ind + 1] = A[ind] * B[ind].
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+  ; Second statement: C[ind] = D[ind] * E[ind].
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+  ; Leave the loop once the incremented index reaches 20; !0 holds only llvm.loop.disable_nonforced.
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.disable_nonforced"}}

Added: llvm/trunk/test/Transforms/LoopDistribute/disable_nonforced_enable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/disable_nonforced_enable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/disable_nonforced_enable.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/disable_nonforced_enable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,51 @@
+; RUN: opt -loop-distribute -S < %s | FileCheck %s
+;
+; Check that llvm.loop.distribute.enable overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced(
+; CHECK: for.body.ldist1:
+define void @disable_nonforced(i32* noalias %a,
+                         i32* noalias %b,
+                         i32* noalias %c,
+                         i32* noalias %d,
+                         i32* noalias %e) {
+entry:
+  br label %for.body
+
+for.body:
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  ; First statement: A[ind + 1] = A[ind] * B[ind].
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4
+  ; Second statement: C[ind] = D[ind] * E[ind].
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4
+  ; Leave the loop once the incremented index reaches 20; !0 holds both disable_nonforced and distribute.enable.
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.distribute.enable", i32 1}}

Added: llvm/trunk/test/Transforms/LoopDistribute/followup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/followup.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/followup.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/followup.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,66 @@
+; RUN: opt -basicaa -loop-distribute -S < %s | FileCheck %s
+;
+; Check that followup loop-attributes are applied to the loops after
+; loop distribution.
+;
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(i32* %a, i32* %b, i32* %c, i32* %d, i32* %e) {  ; a[i+1] = a[i] * b[i]; c[i] = d[i] * e[i]; the pointer arguments are not marked noalias
+entry:
+  br label %for.body
+
+for.body:
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]  ; i, starting at 0
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; a[i+1] = a[i] * b[i]
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4  ; d[i]
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4  ; e[i]
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4  ; c[i] = d[i] * e[i]
+
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0  ; !0 lists distribute.enable plus the four followup attributes
+
+for.end:
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}
+!2 = !{!"llvm.loop.distribute.followup_all", !{!"FollowupAll"}}
+!3 = !{!"llvm.loop.distribute.followup_coincident", !{!"FollowupCoincident", i1 false}}
+!4 = !{!"llvm.loop.distribute.followup_sequential", !{!"FollowupSequential", i32 8}}
+!5 = !{!"llvm.loop.distribute.followup_fallback", !{!"FollowupFallback"}}
+
+
+; CHECK-LABEL: for.body.lver.orig:
+; CHECK: br i1 %exitcond.lver.orig, label %for.end, label %for.body.lver.orig, !llvm.loop ![[LOOP_ORIG:[0-9]+]]
+; CHECK-LABEL: for.body.ldist1:
+; CHECK: br i1 %exitcond.ldist1, label %for.body.ph, label %for.body.ldist1, !llvm.loop ![[LOOP_SEQUENTIAL:[0-9]+]]
+; CHECK-LABEL: for.body:
+; CHECK: br i1 %exitcond, label %for.end, label %for.body, !llvm.loop ![[LOOP_COINCIDENT:[0-9]+]]
+
+; CHECK: ![[LOOP_ORIG]] = distinct !{![[LOOP_ORIG]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOUP_FALLBACK:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
+; CHECK: ![[FOLLOUP_FALLBACK]] = !{!"FollowupFallback"}
+; CHECK: ![[LOOP_SEQUENTIAL]] = distinct !{![[LOOP_SEQUENTIAL]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_SEQUENTIAL:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_SEQUENTIAL]] = !{!"FollowupSequential", i32 8}
+; CHECK: ![[LOOP_COINCIDENT]] = distinct !{![[LOOP_COINCIDENT]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_COINCIDENT:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_COINCIDENT]] = !{!"FollowupCoincident", i1 false}

Added: llvm/trunk/test/Transforms/LoopDistribute/metadata.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/metadata.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/metadata.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/metadata.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,149 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=0 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_OFF
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute=1 -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=EXPLICIT --check-prefix=DEFAULT_ON
+
+; Same loop as in basic.ll.  Check that distribution is enabled/disabled
+; properly according to -enable-loop-distribute=0/1 and the
+; llvm.loop.distribute.enable metadata.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: @explicit_on(
+define void @explicit_on(i32* noalias %a,  ; a[i+1] = a[i] * b[i]; c[i] = d[i] * e[i]; loop tagged with distribute.enable = true
+                         i32* noalias %b,
+                         i32* noalias %c,
+                         i32* noalias %d,
+                         i32* noalias %e) {
+entry:
+  br label %for.body
+
+; EXPLICIT: for.body.ldist1:
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; a[i+1] = a[i] * b[i]
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4  ; d[i]
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4  ; e[i]
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4  ; c[i] = d[i] * e[i]
+
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0  ; !0 holds llvm.loop.distribute.enable = true
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; CHECK-LABEL: @explicit_off(
+define void @explicit_off(i32* noalias %a,  ; same loop body as @explicit_on, but tagged with distribute.enable = false
+                         i32* noalias %b,
+                         i32* noalias %c,
+                         i32* noalias %d,
+                         i32* noalias %e) {
+entry:
+  br label %for.body
+
+; EXPLICIT-NOT: for.body.ldist1:
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; a[i+1] = a[i] * b[i]
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4  ; d[i]
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4  ; e[i]
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4  ; c[i] = d[i] * e[i]
+
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2  ; !2 holds llvm.loop.distribute.enable = false
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; CHECK-LABEL: @default_distribute(
+define void @default_distribute(i32* noalias %a,  ; same loop body again, with no loop metadata on the back-edge branch
+               i32* noalias %b,
+               i32* noalias %c,
+               i32* noalias %d,
+               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; Verify the two distributed loops.
+
+; DEFAULT_ON: for.body.ldist1:
+; DEFAULT_OFF-NOT: for.body.ldist1:
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; a[i+1] = a[i] * b[i]
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4  ; d[i]
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4  ; e[i]
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4  ; c[i] = d[i] * e[i]
+
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body  ; no !llvm.loop attachment here
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.distribute.enable", i1 false}

Added: llvm/trunk/test/Transforms/LoopDistribute/no-if-convert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/no-if-convert.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/no-if-convert.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/no-if-convert.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,95 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S < %s \
+; RUN:   | FileCheck %s
+
+; We should distribute this loop along === but not along ---.  The last
+; partition won't be vectorized due to conditional stores so it's better to
+; keep it with the second partition which has a dependence cycle.
+
+; (1st statement):
+;   for (i = 0; i < n; i++) {
+;     C[i] = D[i] * E[i];
+;=============================
+;     A[i + 1] = A[i] * B[i];
+;-----------------------------
+;     if (F[i])
+;        G[i] = H[i] * J[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define void @f(i32* noalias %a,  ; c[i] = d[i] * e[i]; a[i+1] = a[i] * b[i]; then g[i] = h[i] * j[i] only when i == %x
+               i32* noalias %b,
+               i32* noalias %c,
+               i32* noalias %d,
+               i32* noalias %e,
+               i32* noalias %g,
+               i32* noalias %h,
+               i32* noalias %j,
+               i64 %x) {
+entry:
+  br label %for.body
+
+; Ensure that we have only two partitions, the first with one multiplication
+; and the second with two.
+
+; CHECK: for.body.ldist1:
+; CHECK:    %mulC.ldist1 = mul i32 %loadD.ldist1, %loadE.ldist1
+; CHECK:    br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1
+; CHECK: entry.split:
+; CHECK:    br label %for.body
+; CHECK: for.body:
+; CHECK:    %mulA = mul i32 %loadB, %loadA
+; CHECK:    %mulG = mul i32 %loadH, %loadJ
+; CHECK: for.end:
+
+for.body:                                         ; preds = %if.end, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %if.end ]
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4  ; d[i]
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4  ; e[i]
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4  ; c[i] = d[i] * e[i]
+
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; a[i+1] = a[i] * b[i]
+
+  %if.cond = icmp eq i64 %ind, %x  ; the guard used here is i == %x
+  br i1 %if.cond, label %if.then, label %if.end
+
+if.then:
+  %arrayidxH = getelementptr inbounds i32, i32* %h, i64 %ind
+  %loadH = load i32, i32* %arrayidxH, align 4  ; h[i]
+
+  %arrayidxJ = getelementptr inbounds i32, i32* %j, i64 %ind
+  %loadJ = load i32, i32* %arrayidxJ, align 4  ; j[i]
+
+  %mulG = mul i32 %loadH, %loadJ
+
+  %arrayidxG = getelementptr inbounds i32, i32* %g, i64 %ind
+  store i32 %mulG, i32* %arrayidxG, align 4  ; conditional store g[i] = h[i] * j[i]
+  br label %if.end
+
+if.end:
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %if.end
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/outside-use.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/outside-use.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/outside-use.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/outside-use.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,69 @@
+; RUN: opt -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S < %s \
+; RUN:   | FileCheck %s
+
+; Check that definitions used outside the loop are handled correctly: (1) they
+; are not dropped (2) when versioning the loop, a phi is added to merge the value
+; from the non-distributed loop and the distributed loop.
+;
+;   for (i = 0; i < n; i++) {
+;     A[i + 1] = A[i] * B[i];
+;   ==========================
+;     sum += C[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+@B = common global i32* null, align 8  ; pointer global loaded into %b by @f
+@A = common global i32* null, align 8  ; pointer global loaded into %a by @f
+@C = common global i32* null, align 8  ; pointer global loaded into %c by @f
+@D = common global i32* null, align 8  ; pointer global loaded into %d by @f
+@E = common global i32* null, align 8  ; pointer global loaded into %e by @f
+@SUM = common global i32 0, align 8  ; receives the final %sum_add in @f's for.end block
+
+define void @f() {  ; a[i+1] = a[i] * b[i] plus a running sum of c[i] that is stored to @SUM after the loop
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  %d = load i32*, i32** @D, align 8  ; %d is not referenced again in this function
+  %e = load i32*, i32** @E, align 8  ; %e is not referenced again in this function
+
+  br label %for.body
+
+; CHECK: for.body.ldist1:
+; CHECK:   %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
+; CHECK: for.body.ph:
+; CHECK: for.body:
+; CHECK:   %sum_add = add nuw nsw i32 %sum, %loadC
+; CHECK: for.end:
+; CHECK:   %sum_add.lver = phi i32 [ %sum_add, %for.body ], [ %sum_add.lver.orig, %for.body.lver.orig ]
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %sum = phi i32 [ 0, %entry ], [ %sum_add, %for.body ]  ; running sum, starts at 0
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; a[i+1] = a[i] * b[i]
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  %loadC = load i32, i32* %arrayidxC, align 4  ; c[i]
+
+  %sum_add = add nuw nsw i32 %sum, %loadC  ; sum += c[i]
+
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  store i32 %sum_add, i32* @SUM, align 4  ; use of a loop-defined value outside the loop
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/pr28443.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/pr28443.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/pr28443.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/pr28443.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
+; RUN:   < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @fn1(i64 %a, i64* %b) {  ; loop copying b[i + 268435457] to b[i + 805306369], with i starting at %a and stepping by 2
+entry:
+  br label %for.body
+
+for.body:
+  %add75.epil = phi i64 [ %add7.epil, %for.body ], [ %a, %entry ]  ; i, starts at %a
+  %add1.epil = add nsw i64 %add75.epil, 268435457
+  %arrayidx.epil = getelementptr inbounds i64, i64* %b, i64 %add1.epil
+  %load = load i64, i64* %arrayidx.epil, align 8  ; b[i + 268435457]
+  %add5.epil = add nsw i64 %add75.epil, 805306369
+  %arrayidx6.epil = getelementptr inbounds i64, i64* %b, i64 %add5.epil
+  store i64 %load, i64* %arrayidx6.epil, align 8  ; b[i + 805306369] = b[i + 268435457]
+  %add7.epil = add nsw i64 %add75.epil, 2  ; i + 2, the next value of i
+  %epil.iter.cmp = icmp eq i64 %add7.epil, 0  ; stop once i + 2 equals 0
+  br i1 %epil.iter.cmp, label %for.end, label %for.body
+
+  ; CHECK: %[[phi:.*]]  = phi i64
+  ; CHECK: %[[add1:.*]] = add nsw i64 %[[phi]], 268435457
+  ; CHECK: %[[gep1:.*]] = getelementptr inbounds i64, i64* %b, i64 %[[add1]]
+  ; CHECK: %[[load:.*]] = load i64, i64* %[[gep1]], align 8
+  ; CHECK: %[[add2:.*]] = add nsw i64 %[[phi]], 805306369
+  ; CHECK: %[[gep2:.*]] = getelementptr inbounds i64, i64* %b, i64 %[[add2]]
+  ; CHECK: store i64 %[[load]], i64* %[[gep2]], align 8
+  ; CHECK: %[[incr:.*]] = add nsw i64 %[[phi]], 2
+  ; CHECK: %[[cmp:.*]]  = icmp eq i64 %[[incr]], 0
+  ; CHECK: br i1 %[[cmp]]
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/program-order.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/program-order.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/program-order.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/program-order.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; RUN: opt -loop-distribute -enable-loop-distribute -S -verify-loop-info -verify-dom-info < %s \
+; RUN:   | FileCheck %s
+
+; Distributing this loop to avoid the dependence cycle would require to
+; reorder S1 and S2 to form the two partitions: {S2} | {S1, S3}.  The analysis
+; provided by LoopAccessAnalysis does not allow us to reorder memory
+; operations so make sure we bail on this loop.
+;
+;   for (i = 0; i < n; i++) {
+;     S1: d = D[i];
+;     S2: A[i + 1] = A[i] * B[i];
+;     S3: C[i] = d * E[i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define void @f(i32* noalias %a,  ; the load of d[i] is placed before the store to a[i+1]; its value feeds the later store to c[i]
+               i32* noalias %b,
+               i32* noalias %c,
+               i32* noalias %d,
+               i32* noalias %e) {
+entry:
+  br label %for.body
+
+; CHECK: entry:
+; CHECK:    br label %for.body
+; CHECK: for.body:
+; CHECK:    br i1 %exitcond, label %for.end, label %for.body
+; CHECK: for.end:
+; CHECK:    ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4  ; S1, d = d[i]
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; S2, a[i+1] = a[i] * b[i]
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+
+  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
+  %loadE = load i32, i32* %arrayidxE, align 4  ; e[i]
+
+  %mulC = mul i32 %loadD, %loadE
+
+  store i32 %mulC, i32* %arrayidxC, align 4  ; S3, c[i] = d * e[i]
+
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/symbolic-stride.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/symbolic-stride.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/symbolic-stride.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/symbolic-stride.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -S < %s | \
+; RUN:     FileCheck %s --check-prefix=ALL --check-prefix=STRIDE_SPEC
+
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -S -enable-mem-access-versioning=0 < %s | \
+; RUN:     FileCheck %s --check-prefix=ALL --check-prefix=NO_STRIDE_SPEC
+
+; If we don't speculate stride for 1 we can't distribute along the line
+; because we could have a backward dependence:
+;
+;   for (i = 0; i < n; i++) {
+;     A[i + 1] = A[i] * B[i];
+;     =======================
+;     C[i] = D[i] * A[stride * i];
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; ALL-LABEL: @f(
+define void @f(i32* noalias %a,  ; a[i+1] = a[i] * b[i]; c[i] = d[i] * a[i * %stride]
+               i32* noalias %b,
+               i32* noalias %c,
+               i32* noalias %d,
+               i64 %stride) {
+entry:
+  br label %for.body
+
+; STRIDE_SPEC: %ident.check = icmp ne i64 %stride, 1
+
+; STRIDE_SPEC: for.body.ldist1:
+; NO_STRIDE_SPEC-NOT: for.body.ldist1:
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; a[i+1] = a[i] * b[i]
+
+  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
+  %loadD = load i32, i32* %arrayidxD, align 4  ; d[i]
+
+  %mul = mul i64 %ind, %stride  ; i * stride, the index of the second read from %a
+  %arrayidxStridedA = getelementptr inbounds i32, i32* %a, i64 %mul
+  %loadStridedA = load i32, i32* %arrayidxStridedA, align 4  ; a[i * stride]
+
+  %mulC = mul i32 %loadD, %loadStridedA
+
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+  store i32 %mulC, i32* %arrayidxC, align 4  ; c[i] = d[i] * a[i * stride]
+
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopDistribute/unknown-bounds-for-memchecks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopDistribute/unknown-bounds-for-memchecks.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopDistribute/unknown-bounds-for-memchecks.ll (added)
+++ llvm/trunk/test/Transforms/LoopDistribute/unknown-bounds-for-memchecks.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,57 @@
+; RUN: opt -basicaa -loop-distribute -enable-loop-distribute -S < %s | FileCheck %s
+
+; If we can't find the bounds for one of the arrays in order to generate the
+; memchecks (e.g., C[i * i] below), the loop should not get distributed.
+;
+;   for (i = 0; i < n; i++) {
+;     A[i + 1] = A[i] * 3;
+; -------------------------------
+;     C[i * i] = B[i] * 2;
+;   }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Verify that we didn't distribute by checking that we still have the original
+; number of branches.
+
+@A = common global i32* null, align 8  ; pointer global loaded into %a by @f
+@B = common global i32* null, align 8  ; pointer global loaded into %b by @f
+@C = common global i32* null, align 8  ; pointer global loaded into %c by @f
+
+define void @f() {  ; a[i+1] = a[i] * 3; c[i * i] = b[i] * 2, with the array pointers loaded from globals
+entry:
+  %a = load i32*, i32** @A, align 8
+  %b = load i32*, i32** @B, align 8
+  %c = load i32*, i32** @C, align 8
+  br label %for.body
+; CHECK: br
+
+for.body:                                         ; preds = %for.body, %entry
+  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]
+
+  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+  %loadA = load i32, i32* %arrayidxA, align 4  ; a[i]
+
+  %mulA = mul i32 %loadA, 3
+
+  %add = add nuw nsw i64 %ind, 1  ; i + 1, also the next value of %ind
+  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
+  store i32 %mulA, i32* %arrayidxA_plus_4, align 4  ; a[i+1] = a[i] * 3
+
+  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+  %loadB = load i32, i32* %arrayidxB, align 4  ; b[i]
+
+  %mulC = mul i32 %loadB, 2
+
+  %ind_2 = mul i64 %ind, %ind  ; i * i, the index used for the store into %c
+  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind_2
+  store i32 %mulC, i32* %arrayidxC, align 4  ; c[i * i] = b[i] * 2
+
+  %exitcond = icmp eq i64 %add, 20  ; stop once i + 1 reaches 20
+  br i1 %exitcond, label %for.end, label %for.body
+; CHECK: br
+; CHECK-NOT: br
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopFusion/cannot_fuse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopFusion/cannot_fuse.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopFusion/cannot_fuse.ll (added)
+++ llvm/trunk/test/Transforms/LoopFusion/cannot_fuse.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,371 @@
+; RUN: opt -S -loop-fusion -debug-only=loop-fusion -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+@B = common global [1024 x i32] zeroinitializer, align 16  ; array written by the second loop of the functions below
+
+; CHECK that the two candidates for fusion are placed into separate candidate
+; sets because they are not control flow equivalent.
+
+; CHECK: Performing Loop Fusion on function non_cfe
+; CHECK: Fusion Candidates:
+; CHECK: *** Fusion Candidate Set ***
+; CHECK: bb
+; CHECK: ****************************
+; CHECK: *** Fusion Candidate Set ***
+; CHECK: bb20.preheader
+; CHECK: ****************************
+; CHECK: Loop Fusion complete
+define void @non_cfe(i32* noalias %arg) {  ; two 100-iteration loops; the second is entered only when arg[0] < 0 after the first
+bb:
+  br label %bb5
+
+bb5:                                              ; preds = %bb14, %bb
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %bb14 ], [ 0, %bb ]
+  %.01 = phi i32 [ 0, %bb ], [ %tmp15, %bb14 ]
+  %exitcond4 = icmp ne i64 %indvars.iv2, 100  ; first loop continues while the counter differs from 100
+  br i1 %exitcond4, label %bb7, label %bb16
+
+bb7:                                              ; preds = %bb5
+  %tmp = add nsw i32 %.01, -3
+  %tmp8 = add nuw nsw i64 %indvars.iv2, 3
+  %tmp9 = trunc i64 %tmp8 to i32
+  %tmp10 = mul nsw i32 %tmp, %tmp9
+  %tmp11 = trunc i64 %indvars.iv2 to i32
+  %tmp12 = srem i32 %tmp10, %tmp11
+  %tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv2
+  store i32 %tmp12, i32* %tmp13, align 4  ; first loop writes through %arg
+  br label %bb14
+
+bb14:                                             ; preds = %bb7
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  %tmp15 = add nuw nsw i32 %.01, 1
+  br label %bb5
+
+bb16:                                             ; preds = %bb5
+  %tmp17 = load i32, i32* %arg, align 4  ; arg[0]
+  %tmp18 = icmp slt i32 %tmp17, 0
+  br i1 %tmp18, label %bb20, label %bb33  ; the second loop is skipped unless arg[0] < 0
+
+bb20:                                             ; preds = %bb30, %bb16
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb30 ], [ 0, %bb16 ]
+  %.0 = phi i32 [ 0, %bb16 ], [ %tmp31, %bb30 ]
+  %exitcond = icmp ne i64 %indvars.iv, 100  ; second loop continues while the counter differs from 100
+  br i1 %exitcond, label %bb22, label %bb33
+
+bb22:                                             ; preds = %bb20
+  %tmp23 = add nsw i32 %.0, -3
+  %tmp24 = add nuw nsw i64 %indvars.iv, 3
+  %tmp25 = trunc i64 %tmp24 to i32
+  %tmp26 = mul nsw i32 %tmp23, %tmp25
+  %tmp27 = trunc i64 %indvars.iv to i32
+  %tmp28 = srem i32 %tmp26, %tmp27
+  %tmp29 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  store i32 %tmp28, i32* %tmp29, align 4  ; second loop writes into @B
+  br label %bb30
+
+bb30:                                             ; preds = %bb22
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %tmp31 = add nuw nsw i32 %.0, 1
+  br label %bb20
+
+bb33:                                             ; preds = %bb20, %bb16
+  ret void
+}
+
+; Check that fusion detects the two candidates are not adjacent (the exit block
+; of the first candidate is not the preheader of the second candidate).
+
+; CHECK: Performing Loop Fusion on function non_adjacent
+; CHECK: Fusion Candidates:
+; CHECK: *** Fusion Candidate Set ***
+; CHECK-NEXT: [[LOOP1PREHEADER:bb[0-9]*]]
+; CHECK-NEXT: [[LOOP2PREHEADER:bb[0-9]*]]
+; CHECK-NEXT: ****************************
+; CHECK: Attempting fusion on Candidate Set:
+; CHECK-NEXT: [[LOOP1PREHEADER]]
+; CHECK-NEXT: [[LOOP2PREHEADER]]
+; CHECK: Fusion candidates are not adjacent. Not fusing.
+; CHECK: Loop Fusion complete
+define void @non_adjacent(i32* noalias %arg) {  ; two 100-iteration loops; the first exits to %bb4, which is a different block from %bb13 ahead of the second
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb11, %bb
+  %.01 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ]
+  %exitcond2 = icmp ne i64 %.01, 100  ; first loop continues while the counter differs from 100
+  br i1 %exitcond2, label %bb5, label %bb4
+
+bb4:                                              ; preds = %bb3
+  br label %bb13
+
+bb5:                                              ; preds = %bb3
+  %tmp = add nsw i64 %.01, -3
+  %tmp6 = add nuw nsw i64 %.01, 3
+  %tmp7 = mul nsw i64 %tmp, %tmp6
+  %tmp8 = srem i64 %tmp7, %.01
+  %tmp9 = trunc i64 %tmp8 to i32
+  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %.01
+  store i32 %tmp9, i32* %tmp10, align 4  ; first loop writes through %arg
+  br label %bb11
+
+bb11:                                             ; preds = %bb5
+  %tmp12 = add nuw nsw i64 %.01, 1
+  br label %bb3
+
+bb13:                                             ; preds = %bb4
+  br label %bb14
+
+bb14:                                             ; preds = %bb23, %bb13
+  %.0 = phi i64 [ 0, %bb13 ], [ %tmp24, %bb23 ]
+  %exitcond = icmp ne i64 %.0, 100  ; second loop continues while the counter differs from 100
+  br i1 %exitcond, label %bb16, label %bb15
+
+bb15:                                             ; preds = %bb14
+  br label %bb25
+
+bb16:                                             ; preds = %bb14
+  %tmp17 = add nsw i64 %.0, -3
+  %tmp18 = add nuw nsw i64 %.0, 3
+  %tmp19 = mul nsw i64 %tmp17, %tmp18
+  %tmp20 = srem i64 %tmp19, %.0
+  %tmp21 = trunc i64 %tmp20 to i32
+  %tmp22 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %.0
+  store i32 %tmp21, i32* %tmp22, align 4  ; second loop writes into @B
+  br label %bb23
+
+bb23:                                             ; preds = %bb16
+  %tmp24 = add nuw nsw i64 %.0, 1
+  br label %bb14
+
+bb25:                                             ; preds = %bb15
+  ret void
+}
+
+; Check that the different bounds are detected and prevent fusion.
+
+; CHECK: Performing Loop Fusion on function different_bounds
+; CHECK: Fusion Candidates:
+; CHECK: *** Fusion Candidate Set ***
+; CHECK-NEXT: [[LOOP1PREHEADER:bb[0-9]*]]
+; CHECK-NEXT: [[LOOP2PREHEADER:bb[0-9]*]]
+; CHECK-NEXT: ****************************
+; CHECK: Attempting fusion on Candidate Set:
+; CHECK-NEXT: [[LOOP1PREHEADER]]
+; CHECK-NEXT: [[LOOP2PREHEADER]]
+; CHECK: Fusion candidates do not have identical trip counts. Not fusing.
+; CHECK: Loop Fusion complete
+define void @different_bounds(i32* noalias %arg) {  ; same shape as @non_adjacent, but the loops compare against 100 and 200
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb11, %bb
+  %.01 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ]
+  %exitcond2 = icmp ne i64 %.01, 100  ; first loop compares its counter against 100
+  br i1 %exitcond2, label %bb5, label %bb4
+
+bb4:                                              ; preds = %bb3
+  br label %bb13
+
+bb5:                                              ; preds = %bb3
+  %tmp = add nsw i64 %.01, -3
+  %tmp6 = add nuw nsw i64 %.01, 3
+  %tmp7 = mul nsw i64 %tmp, %tmp6
+  %tmp8 = srem i64 %tmp7, %.01
+  %tmp9 = trunc i64 %tmp8 to i32
+  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %.01
+  store i32 %tmp9, i32* %tmp10, align 4  ; first loop writes through %arg
+  br label %bb11
+
+bb11:                                             ; preds = %bb5
+  %tmp12 = add nuw nsw i64 %.01, 1
+  br label %bb3
+
+bb13:                                             ; preds = %bb4
+  br label %bb14
+
+bb14:                                             ; preds = %bb23, %bb13
+  %.0 = phi i64 [ 0, %bb13 ], [ %tmp24, %bb23 ]
+  %exitcond = icmp ne i64 %.0, 200  ; second loop compares its counter against 200
+  br i1 %exitcond, label %bb16, label %bb15
+
+bb15:                                             ; preds = %bb14
+  br label %bb25
+
+bb16:                                             ; preds = %bb14
+  %tmp17 = add nsw i64 %.0, -3
+  %tmp18 = add nuw nsw i64 %.0, 3
+  %tmp19 = mul nsw i64 %tmp17, %tmp18
+  %tmp20 = srem i64 %tmp19, %.0
+  %tmp21 = trunc i64 %tmp20 to i32
+  %tmp22 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %.0
+  store i32 %tmp21, i32* %tmp22, align 4  ; second loop writes into @B
+  br label %bb23
+
+bb23:                                             ; preds = %bb16
+  %tmp24 = add nuw nsw i64 %.0, 1
+  br label %bb14
+
+bb25:                                             ; preds = %bb15
+  ret void
+}
+
+; Check that the negative dependence between the two candidates is identified
+; and prevents fusion.
+
+; CHECK: Performing Loop Fusion on function negative_dependence
+; CHECK: Fusion Candidates:
+; CHECK: *** Fusion Candidate Set ***
+; CHECK-NEXT: [[LOOP1PREHEADER:bb[0-9]*]]
+; CHECK-NEXT: [[LOOP2PREHEADER:bb[0-9]*]]
+; CHECK-NEXT: ****************************
+; CHECK: Attempting fusion on Candidate Set:
+; CHECK-NEXT: [[LOOP1PREHEADER]]
+; CHECK-NEXT: [[LOOP2PREHEADER]]
+; CHECK: Memory dependencies do not allow fusion!
+; CHECK: Loop Fusion complete
+define void @negative_dependence(i32* noalias %arg) {  ; loop 1 writes %arg[i]; loop 2 reads %arg[i+1]
+bb:
+  br label %bb5
+
+bb5:                                              ; preds = %bb9, %bb
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %bb9 ], [ 0, %bb ]
+  %exitcond4 = icmp ne i64 %indvars.iv2, 100
+  br i1 %exitcond4, label %bb7, label %bb11  ; first loop exits straight into the second loop's header
+
+bb7:                                              ; preds = %bb5
+  %tmp = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv2
+  %tmp8 = trunc i64 %indvars.iv2 to i32
+  store i32 %tmp8, i32* %tmp, align 4  ; %arg[i] = i
+  br label %bb9
+
+bb9:                                              ; preds = %bb7
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  br label %bb5
+
+bb11:                                             ; preds = %bb18, %bb5
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb18 ], [ 0, %bb5 ]
+  %exitcond = icmp ne i64 %indvars.iv, 100
+  br i1 %exitcond, label %bb13, label %bb19
+
+bb13:                                             ; preds = %bb11
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1  ; increment lives in the body because it also forms the load index
+  %tmp14 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv.next
+  %tmp15 = load i32, i32* %tmp14, align 4  ; reads %arg[i+1], one element ahead of this iteration's index
+  %tmp16 = shl nsw i32 %tmp15, 1
+  %tmp17 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  store i32 %tmp16, i32* %tmp17, align 4  ; @B[i] = %arg[i+1] << 1
+  br label %bb18
+
+bb18:                                             ; preds = %bb13
+  br label %bb11
+
+bb19:                                             ; preds = %bb11
+  ret void
+}
+
+; Check for values defined in Loop 0 and used in Loop 1.
+; It is not safe to fuse in this case, because the second loop has
+; a use of %.01.lcssa which is defined in the body of loop 0. The
+; first loop must execute completely in order to compute the correct
+; value of %.01.lcssa to be used in the second loop.
+
+; CHECK: Performing Loop Fusion on function sumTest
+; CHECK: Fusion Candidates:
+; CHECK: *** Fusion Candidate Set ***
+; CHECK-NEXT: [[LOOP1PREHEADER:bb[0-9]*]]
+; CHECK-NEXT: [[LOOP2PREHEADER:bb[0-9]*]]
+; CHECK-NEXT: ****************************
+; CHECK: Attempting fusion on Candidate Set:
+; CHECK-NEXT: [[LOOP1PREHEADER]]
+; CHECK-NEXT: [[LOOP2PREHEADER]]
+; CHECK: Memory dependencies do not allow fusion!
+; CHECK: Loop Fusion complete
+define i32 @sumTest(i32* noalias %arg) {  ; loop 1 sums %arg[0..99]; loop 2 divides each %arg[i] by that sum
+bb:
+  br label %bb6
+
+bb6:                                              ; preds = %bb9, %bb
+  %indvars.iv3 = phi i64 [ %indvars.iv.next4, %bb9 ], [ 0, %bb ]
+  %.01 = phi i32 [ 0, %bb ], [ %tmp11, %bb9 ]  ; running sum carried around the first loop
+  %exitcond5 = icmp ne i64 %indvars.iv3, 100
+  br i1 %exitcond5, label %bb9, label %bb13
+
+bb9:                                              ; preds = %bb6
+  %tmp = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv3
+  %tmp10 = load i32, i32* %tmp, align 4
+  %tmp11 = add nsw i32 %.01, %tmp10  ; sum += %arg[i]
+  %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+  br label %bb6
+
+bb13:                                             ; preds = %bb20, %bb6
+  %.01.lcssa = phi i32 [ %.01, %bb6 ], [ %.01.lcssa, %bb20 ]  ; final sum from loop 1, unchanged across loop 2 iterations
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb20 ], [ 0, %bb6 ]
+  %exitcond = icmp ne i64 %indvars.iv, 100
+  br i1 %exitcond, label %bb15, label %bb14
+
+bb14:                                             ; preds = %bb13
+  br label %bb21
+
+bb15:                                             ; preds = %bb13
+  %tmp16 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv
+  %tmp17 = load i32, i32* %tmp16, align 4
+  %tmp18 = sdiv i32 %tmp17, %.01.lcssa  ; second loop consumes the value produced by the first loop
+  %tmp19 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  store i32 %tmp18, i32* %tmp19, align 4  ; @B[i] = %arg[i] / sum
+  br label %bb20
+
+bb20:                                             ; preds = %bb15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb13
+
+bb21:                                             ; preds = %bb14
+  ret i32 %.01.lcssa
+}
+
+; Similar to sumTest above. The first loop computes %add and must
+; complete before it is used in the second loop. Thus, these two loops
+; also cannot be fused.
+
+; CHECK: Performing Loop Fusion on function test
+; CHECK: Fusion Candidates:
+; CHECK: *** Fusion Candidate Set ***
+; CHECK-NEXT: [[LOOP1PREHEADER:for.body[0-9]*.preheader]]
+; CHECK-NEXT: [[LOOP2PREHEADER:for.body[0-9]*.preheader]]
+; CHECK-NEXT: ****************************
+; CHECK: Attempting fusion on Candidate Set:
+; CHECK-NEXT: [[LOOP1PREHEADER]]
+; CHECK-NEXT: [[LOOP2PREHEADER]]
+; CHECK: Memory dependencies do not allow fusion!
+; CHECK: Loop Fusion complete
+define float @test(float* nocapture %a, i32 %n) {  ; loop 1 accumulates a float sum of %a; loop 2 divides each element by it
+entry:
+  %conv = zext i32 %n to i64
+  %cmp32 = icmp eq i32 %n, 0
+  br i1 %cmp32, label %for.cond.cleanup7, label %for.body  ; %n == 0 bypasses both loops
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.034 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %sum1.033 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]  ; running sum, starts at 0.0
+  %idxprom = trunc i64 %i.034 to i32
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %sum1.033, %0
+  %inc = add nuw nsw i64 %i.034, 1
+  %cmp = icmp ult i64 %inc, %conv
+  br i1 %cmp, label %for.body, label %for.body8  ; single-block loop; falls into the second loop when %inc reaches %conv
+
+for.body8:                                        ; preds = %for.body, %for.body8
+  %i2.031 = phi i64 [ %inc14, %for.body8 ], [ 0, %for.body ]
+  %idxprom9 = trunc i64 %i2.031 to i32
+  %arrayidx10 = getelementptr inbounds float, float* %a, i32 %idxprom9
+  %1 = load float, float* %arrayidx10, align 4
+  %div = fdiv float %1, %add  ; uses %add, the value computed by the first loop
+  store float %div, float* %arrayidx10, align 4  ; result written back to the same element of %a
+  %inc14 = add nuw nsw i64 %i2.031, 1
+  %cmp5 = icmp ult i64 %inc14, %conv
+  br i1 %cmp5, label %for.body8, label %for.cond.cleanup7
+
+for.cond.cleanup7:                                ; preds = %for.body8, %entry
+  %sum1.0.lcssa36 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body8 ]  ; 0.0 on the %n == 0 path, otherwise the sum
+  ret float %sum1.0.lcssa36
+}

Added: llvm/trunk/test/Transforms/LoopFusion/four_loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopFusion/four_loops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopFusion/four_loops.ll (added)
+++ llvm/trunk/test/Transforms/LoopFusion/four_loops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,136 @@
+; RUN: opt -S -loop-fusion < %s | FileCheck %s
+
+@A = common global [1024 x i32] zeroinitializer, align 16
+@B = common global [1024 x i32] zeroinitializer, align 16
+@C = common global [1024 x i32] zeroinitializer, align 16
+@D = common global [1024 x i32] zeroinitializer, align 16
+
+; CHECK: void @dep_free
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label %[[LOOP1HEADER:bb[0-9]+]]
+; CHECK: [[LOOP1HEADER]]
+; CHECK: br i1 %exitcond12, label %[[LOOP1BODY:bb[0-9]+]], label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br label %[[LOOP1LATCH:bb[0-9]+]]
+; CHECK: [[LOOP1LATCH]]
+; CHECK: br label %[[LOOP2PREHEADER]]
+; CHECK: [[LOOP2PREHEADER]]
+; CHECK: br i1 %exitcond9, label %[[LOOP2HEADER:bb[0-9]+]], label %[[LOOP3PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP2HEADER]]
+; CHECK: br label %[[LOOP2LATCH:bb[0-9]+]]
+; CHECK: [[LOOP2LATCH]]
+; CHECK: br label %[[LOOP3PREHEADER]]
+; CHECK: [[LOOP3PREHEADER]]
+; CHECK: br i1 %exitcond6, label %[[LOOP3HEADER:bb[0-9]+]], label %[[LOOP4PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP3HEADER]]
+; CHECK: br label %[[LOOP3LATCH:bb[0-9]+]]
+; CHECK: [[LOOP3LATCH]]
+; CHECK: br label %[[LOOP4PREHEADER]]
+; CHECK: [[LOOP4PREHEADER]]
+; CHECK: br i1 %exitcond, label %[[LOOP4HEADER:bb[0-9]+]], label %[[LOOP4EXIT:bb[0-9]+]]
+; CHECK: [[LOOP4EXIT]]
+; CHECK: br label %[[FUNCEXIT:bb[0-9]+]]
+; CHECK: [[LOOP4HEADER]]
+; CHECK: br label %[[LOOP4LATCH:bb[0-9]+]]
+; CHECK: [[LOOP4LATCH]]
+; CHECK: br label %[[LOOP1HEADER]]
+; CHECK: [[FUNCEXIT]]
+; CHECK: ret void
+define void @dep_free() {  ; four consecutive 100-iteration loops, each storing to a different global array
+bb:
+  br label %bb13
+
+bb13:                                             ; preds = %bb22, %bb
+  %indvars.iv10 = phi i64 [ %indvars.iv.next11, %bb22 ], [ 0, %bb ]
+  %.0 = phi i32 [ 0, %bb ], [ %tmp23, %bb22 ]  ; i32 counter stepped in lockstep with %indvars.iv10
+  %exitcond12 = icmp ne i64 %indvars.iv10, 100
+  br i1 %exitcond12, label %bb15, label %bb25  ; first loop exits into the second loop's header
+
+bb15:                                             ; preds = %bb13
+  %tmp = add nsw i32 %.0, -3
+  %tmp16 = add nuw nsw i64 %indvars.iv10, 3
+  %tmp17 = trunc i64 %tmp16 to i32
+  %tmp18 = mul nsw i32 %tmp, %tmp17
+  %tmp19 = trunc i64 %indvars.iv10 to i32
+  %tmp20 = srem i32 %tmp18, %tmp19  ; NOTE(review): divisor is 0 on the first iteration
+  %tmp21 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv10
+  store i32 %tmp20, i32* %tmp21, align 4  ; loop 1 writes @A
+  br label %bb22
+
+bb22:                                             ; preds = %bb15
+  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
+  %tmp23 = add nuw nsw i32 %.0, 1
+  br label %bb13
+
+bb25:                                             ; preds = %bb35, %bb13
+  %indvars.iv7 = phi i64 [ %indvars.iv.next8, %bb35 ], [ 0, %bb13 ]
+  %.01 = phi i32 [ 0, %bb13 ], [ %tmp36, %bb35 ]
+  %exitcond9 = icmp ne i64 %indvars.iv7, 100
+  br i1 %exitcond9, label %bb27, label %bb38  ; second loop exits into the third loop's header
+
+bb27:                                             ; preds = %bb25
+  %tmp28 = add nsw i32 %.01, -3
+  %tmp29 = add nuw nsw i64 %indvars.iv7, 3
+  %tmp30 = trunc i64 %tmp29 to i32
+  %tmp31 = mul nsw i32 %tmp28, %tmp30
+  %tmp32 = trunc i64 %indvars.iv7 to i32
+  %tmp33 = srem i32 %tmp31, %tmp32
+  %tmp34 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv7
+  store i32 %tmp33, i32* %tmp34, align 4  ; loop 2 writes @B
+  br label %bb35
+
+bb35:                                             ; preds = %bb27
+  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+  %tmp36 = add nuw nsw i32 %.01, 1
+  br label %bb25
+
+bb38:                                             ; preds = %bb48, %bb25
+  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %bb48 ], [ 0, %bb25 ]
+  %.02 = phi i32 [ 0, %bb25 ], [ %tmp49, %bb48 ]
+  %exitcond6 = icmp ne i64 %indvars.iv4, 100
+  br i1 %exitcond6, label %bb40, label %bb51  ; third loop exits into the fourth loop's header
+
+bb40:                                             ; preds = %bb38
+  %tmp41 = add nsw i32 %.02, -3
+  %tmp42 = add nuw nsw i64 %indvars.iv4, 3
+  %tmp43 = trunc i64 %tmp42 to i32
+  %tmp44 = mul nsw i32 %tmp41, %tmp43
+  %tmp45 = trunc i64 %indvars.iv4 to i32
+  %tmp46 = srem i32 %tmp44, %tmp45
+  %tmp47 = getelementptr inbounds [1024 x i32], [1024 x i32]* @C, i64 0, i64 %indvars.iv4
+  store i32 %tmp46, i32* %tmp47, align 4  ; loop 3 writes @C
+  br label %bb48
+
+bb48:                                             ; preds = %bb40
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  %tmp49 = add nuw nsw i32 %.02, 1
+  br label %bb38
+
+bb51:                                             ; preds = %bb61, %bb38
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb61 ], [ 0, %bb38 ]
+  %.03 = phi i32 [ 0, %bb38 ], [ %tmp62, %bb61 ]
+  %exitcond = icmp ne i64 %indvars.iv, 100
+  br i1 %exitcond, label %bb53, label %bb52  ; only the last loop has a separate exit block (%bb52)
+
+bb52:                                             ; preds = %bb51
+  br label %bb63
+
+bb53:                                             ; preds = %bb51
+  %tmp54 = add nsw i32 %.03, -3
+  %tmp55 = add nuw nsw i64 %indvars.iv, 3
+  %tmp56 = trunc i64 %tmp55 to i32
+  %tmp57 = mul nsw i32 %tmp54, %tmp56
+  %tmp58 = trunc i64 %indvars.iv to i32
+  %tmp59 = srem i32 %tmp57, %tmp58
+  %tmp60 = getelementptr inbounds [1024 x i32], [1024 x i32]* @D, i64 0, i64 %indvars.iv
+  store i32 %tmp59, i32* %tmp60, align 4  ; loop 4 writes @D
+  br label %bb61
+
+bb61:                                             ; preds = %bb53
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %tmp62 = add nuw nsw i32 %.03, 1
+  br label %bb51
+
+bb63:                                             ; preds = %bb52
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopFusion/inner_loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopFusion/inner_loops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopFusion/inner_loops.ll (added)
+++ llvm/trunk/test/Transforms/LoopFusion/inner_loops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,86 @@
+; RUN: opt -S -loop-fusion < %s 2>&1 | FileCheck %s
+
+@A = common global [1024 x [1024 x i32]] zeroinitializer, align 16
+@B = common global [1024 x [1024 x i32]] zeroinitializer, align 16
+
+; CHECK: void @dep_free
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label %[[LOOP1HEADER:bb[0-9]*]]
+; CHECK: [[LOOP1HEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP1BODY:bb[0-9]*]], label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br label %[[LOOP1LATCH:bb[0-9]*]]
+; CHECK: [[LOOP1LATCH]]
+; CHECK: br label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP2PREHEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP2BODY:bb[0-9]*]], label %[[LOOP2EXIT:bb[0-9]*]]
+; CHECK: [[LOOP2BODY]]
+; CHECK: br label %[[LOOP2LATCH:bb[0-9]+]]
+; CHECK: [[LOOP2LATCH]]
+; CHECK: br label %[[LOOP1HEADER]]
+; CHECK: ret void
+
+define void @dep_free() {  ; one outer loop containing two consecutive inner loops (stores to @A, then @B)
+bb:
+  br label %bb9
+
+bb9:                                              ; preds = %bb35, %bb
+  %indvars.iv6 = phi i64 [ %indvars.iv.next7, %bb35 ], [ 0, %bb ]  ; outer loop counter
+  %.0 = phi i32 [ 0, %bb ], [ %tmp36, %bb35 ]  ; i32 counter stepped in lockstep with %indvars.iv6
+  %exitcond8 = icmp ne i64 %indvars.iv6, 100
+  br i1 %exitcond8, label %bb11, label %bb10
+
+bb10:                                             ; preds = %bb9
+  br label %bb37
+
+bb11:                                             ; preds = %bb9
+  br label %bb12
+
+bb12:                                             ; preds = %bb21, %bb11
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb21 ], [ 0, %bb11 ]  ; first inner loop counter
+  %exitcond = icmp ne i64 %indvars.iv, 100
+  br i1 %exitcond, label %bb14, label %bb23  ; first inner loop exits into the second inner loop's header
+
+bb14:                                             ; preds = %bb12
+  %tmp = add nsw i32 %.0, -3
+  %tmp15 = add nuw nsw i64 %indvars.iv6, 3
+  %tmp16 = trunc i64 %tmp15 to i32
+  %tmp17 = mul nsw i32 %tmp, %tmp16
+  %tmp18 = trunc i64 %indvars.iv6 to i32
+  %tmp19 = srem i32 %tmp17, %tmp18  ; NOTE(review): divisor is 0 while the outer counter is 0
+  %tmp20 = getelementptr inbounds [1024 x [1024 x i32]], [1024 x [1024 x i32]]* @A, i64 0, i64 %indvars.iv6, i64 %indvars.iv
+  store i32 %tmp19, i32* %tmp20, align 4  ; writes @A[outer][inner]
+  br label %bb21
+
+bb21:                                             ; preds = %bb14
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb12
+
+bb23:                                             ; preds = %bb33, %bb12
+  %indvars.iv3 = phi i64 [ %indvars.iv.next4, %bb33 ], [ 0, %bb12 ]  ; second inner loop counter
+  %exitcond5 = icmp ne i64 %indvars.iv3, 100
+  br i1 %exitcond5, label %bb25, label %bb35  ; second inner loop exits to the outer latch
+
+bb25:                                             ; preds = %bb23
+  %tmp26 = add nsw i32 %.0, -3
+  %tmp27 = add nuw nsw i64 %indvars.iv6, 3
+  %tmp28 = trunc i64 %tmp27 to i32
+  %tmp29 = mul nsw i32 %tmp26, %tmp28
+  %tmp30 = trunc i64 %indvars.iv6 to i32
+  %tmp31 = srem i32 %tmp29, %tmp30
+  %tmp32 = getelementptr inbounds [1024 x [1024 x i32]], [1024 x [1024 x i32]]* @B, i64 0, i64 %indvars.iv6, i64 %indvars.iv3
+  store i32 %tmp31, i32* %tmp32, align 4  ; writes @B[outer][inner]
+  br label %bb33
+
+bb33:                                             ; preds = %bb25
+  %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+  br label %bb23
+
+bb35:                                             ; preds = %bb23
+  %indvars.iv.next7 = add nuw nsw i64 %indvars.iv6, 1
+  %tmp36 = add nuw nsw i32 %.0, 1
+  br label %bb9
+
+bb37:                                             ; preds = %bb10
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopFusion/loop_nest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopFusion/loop_nest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopFusion/loop_nest.ll (added)
+++ llvm/trunk/test/Transforms/LoopFusion/loop_nest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,120 @@
+; RUN: opt -S -loop-fusion < %s | FileCheck %s
+;
+;    int A[1024][1024];
+;    int B[1024][1024];
+;
+;    #define EXPENSIVE_PURE_COMPUTATION(i) ((i - 3) * (i + 3) % i)
+;
+;    void dep_free() {
+;
+;      for (int i = 0; i < 100; i++)
+;        for (int j = 0; j < 100; j++)
+;          A[i][j] = EXPENSIVE_PURE_COMPUTATION(i);
+;
+;      for (int i = 0; i < 100; i++)
+;        for (int j = 0; j < 100; j++)
+;          B[i][j] = EXPENSIVE_PURE_COMPUTATION(i);
+;    }
+;
+@A = common global [1024 x [1024 x i32]] zeroinitializer, align 16
+@B = common global [1024 x [1024 x i32]] zeroinitializer, align 16
+
+; CHECK: void @dep_free
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label %[[LOOP1HEADER:bb[0-9]+]]
+; CHECK: [[LOOP1HEADER]]
+; CHECK: br i1 %exitcond12, label %[[LOOP3PREHEADER:bb[0-9]+.preheader]], label %[[LOOP2HEADER:bb[0-9]+]]
+; CHECK: [[LOOP3PREHEADER]]
+; CHECK: br label %[[LOOP3HEADER:bb[0-9]+]]
+; CHECK: [[LOOP3HEADER]]
+; CHECK: br i1 %exitcond9, label %[[LOOP3BODY:bb[0-9]+]], label %[[LOOP1LATCH:bb[0-9]+]]
+; CHECK: [[LOOP1LATCH]]
+; CHECK: br label %[[LOOP2HEADER:bb[0-9]+]]
+; CHECK: [[LOOP2HEADER]]
+; CHECK: br i1 %exitcond6, label %[[LOOP4PREHEADER:bb[0-9]+.preheader]], label %[[LOOP2EXITBLOCK:bb[0-9]+]]
+; CHECK: [[LOOP4PREHEADER]]
+; CHECK: br label %[[LOOP4HEADER:bb[0-9]+]]
+; CHECK: [[LOOP2EXITBLOCK]]
+; CHECK-NEXT: br label %[[FUNCEXIT:bb[0-9]+]]
+; CHECK: [[LOOP4HEADER]]
+; CHECK: br i1 %exitcond, label %[[LOOP4BODY:bb[0-9]+]], label %[[LOOP2LATCH:bb[0-9]+]]
+; CHECK: [[LOOP2LATCH]]
+; CHECK: br label %[[LOOP1HEADER:bb[0-9]+]]
+; CHECK: [[FUNCEXIT]]
+; CHECK: ret void
+
+; TODO: The current version of loop fusion does not allow the inner loops to be
+; fused because they are not control flow equivalent and adjacent. These are
+; limitations that can be addressed in future improvements to fusion.
+define void @dep_free() {  ; two consecutive two-deep loop nests; the first stores to @A, the second to @B
+bb:
+  br label %bb13
+
+bb13:                                             ; preds = %bb27, %bb
+  %indvars.iv10 = phi i64 [ %indvars.iv.next11, %bb27 ], [ 0, %bb ]  ; outer counter of the first nest
+  %.0 = phi i32 [ 0, %bb ], [ %tmp28, %bb27 ]
+  %exitcond12 = icmp ne i64 %indvars.iv10, 100
+  br i1 %exitcond12, label %bb16, label %bb30  ; first nest exits into the second nest's outer header
+
+bb16:                                             ; preds = %bb25, %bb13
+  %indvars.iv7 = phi i64 [ %indvars.iv.next8, %bb25 ], [ 0, %bb13 ]  ; inner counter of the first nest
+  %exitcond9 = icmp ne i64 %indvars.iv7, 100
+  br i1 %exitcond9, label %bb18, label %bb27
+
+bb18:                                             ; preds = %bb16
+  %tmp = add nsw i32 %.0, -3
+  %tmp19 = add nuw nsw i64 %indvars.iv10, 3
+  %tmp20 = trunc i64 %tmp19 to i32
+  %tmp21 = mul nsw i32 %tmp, %tmp20
+  %tmp22 = trunc i64 %indvars.iv10 to i32
+  %tmp23 = srem i32 %tmp21, %tmp22  ; NOTE(review): divisor is 0 while the outer counter is 0
+  %tmp24 = getelementptr inbounds [1024 x [1024 x i32]], [1024 x [1024 x i32]]* @A, i64 0, i64 %indvars.iv10, i64 %indvars.iv7
+  store i32 %tmp23, i32* %tmp24, align 4  ; writes @A[outer][inner]
+  br label %bb25
+
+bb25:                                             ; preds = %bb18
+  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+  br label %bb16
+
+bb27:                                             ; preds = %bb16
+  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
+  %tmp28 = add nuw nsw i32 %.0, 1
+  br label %bb13
+
+bb30:                                             ; preds = %bb45, %bb13
+  %indvars.iv4 = phi i64 [ %indvars.iv.next5, %bb45 ], [ 0, %bb13 ]  ; outer counter of the second nest
+  %.02 = phi i32 [ 0, %bb13 ], [ %tmp46, %bb45 ]
+  %exitcond6 = icmp ne i64 %indvars.iv4, 100
+  br i1 %exitcond6, label %bb33, label %bb31
+
+bb31:                                             ; preds = %bb30
+  br label %bb47
+
+bb33:                                             ; preds = %bb43, %bb30
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb43 ], [ 0, %bb30 ]  ; inner counter of the second nest
+  %exitcond = icmp ne i64 %indvars.iv, 100
+  br i1 %exitcond, label %bb35, label %bb45
+
+bb35:                                             ; preds = %bb33
+  %tmp36 = add nsw i32 %.02, -3
+  %tmp37 = add nuw nsw i64 %indvars.iv4, 3
+  %tmp38 = trunc i64 %tmp37 to i32
+  %tmp39 = mul nsw i32 %tmp36, %tmp38
+  %tmp40 = trunc i64 %indvars.iv4 to i32
+  %tmp41 = srem i32 %tmp39, %tmp40
+  %tmp42 = getelementptr inbounds [1024 x [1024 x i32]], [1024 x [1024 x i32]]* @B, i64 0, i64 %indvars.iv4, i64 %indvars.iv
+  store i32 %tmp41, i32* %tmp42, align 4  ; writes @B[outer][inner]
+  br label %bb43
+
+bb43:                                             ; preds = %bb35
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb33
+
+bb45:                                             ; preds = %bb33
+  %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+  %tmp46 = add nuw nsw i32 %.02, 1
+  br label %bb30
+
+bb47:                                             ; preds = %bb31
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopFusion/simple.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopFusion/simple.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopFusion/simple.ll (added)
+++ llvm/trunk/test/Transforms/LoopFusion/simple.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,317 @@
+; RUN: opt -S -loop-fusion < %s | FileCheck %s
+
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+; CHECK: void @dep_free
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label %[[LOOP1HEADER:bb[0-9]*]]
+; CHECK: [[LOOP1HEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP1BODY:bb[0-9]*]], label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br label %[[LOOP1LATCH:bb[0-9]*]]
+; CHECK: [[LOOP1LATCH]]
+; CHECK: br label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP2PREHEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP2BODY:bb[0-9]*]], label %[[LOOP2EXIT:bb[0-9]*]]
+; CHECK: [[LOOP2BODY]]
+; CHECK: br label %[[LOOP2LATCH:bb[0-9]+]]
+; CHECK: [[LOOP2LATCH]]
+; CHECK: br label %[[LOOP1HEADER]]
+; CHECK: ret void
+define void @dep_free(i32* noalias %arg) {  ; two consecutive 100-iteration loops: the first stores to %arg, the second to @B
+bb:
+  br label %bb5
+
+bb5:                                              ; preds = %bb14, %bb
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %bb14 ], [ 0, %bb ]
+  %.01 = phi i32 [ 0, %bb ], [ %tmp15, %bb14 ]  ; i32 counter stepped in lockstep with %indvars.iv2
+  %exitcond4 = icmp ne i64 %indvars.iv2, 100
+  br i1 %exitcond4, label %bb7, label %bb17  ; first loop exits into the second loop's header
+
+bb7:                                              ; preds = %bb5
+  %tmp = add nsw i32 %.01, -3
+  %tmp8 = add nuw nsw i64 %indvars.iv2, 3
+  %tmp9 = trunc i64 %tmp8 to i32
+  %tmp10 = mul nsw i32 %tmp, %tmp9
+  %tmp11 = trunc i64 %indvars.iv2 to i32
+  %tmp12 = srem i32 %tmp10, %tmp11  ; NOTE(review): divisor is 0 on the first iteration
+  %tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv2
+  store i32 %tmp12, i32* %tmp13, align 4  ; writes %arg[i]
+  br label %bb14
+
+bb14:                                             ; preds = %bb7
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  %tmp15 = add nuw nsw i32 %.01, 1
+  br label %bb5
+
+bb17:                                             ; preds = %bb27, %bb5
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb27 ], [ 0, %bb5 ]
+  %.0 = phi i32 [ 0, %bb5 ], [ %tmp28, %bb27 ]
+  %exitcond = icmp ne i64 %indvars.iv, 100
+  br i1 %exitcond, label %bb19, label %bb18
+
+bb18:                                             ; preds = %bb17
+  br label %bb29
+
+bb19:                                             ; preds = %bb17
+  %tmp20 = add nsw i32 %.0, -3
+  %tmp21 = add nuw nsw i64 %indvars.iv, 3
+  %tmp22 = trunc i64 %tmp21 to i32
+  %tmp23 = mul nsw i32 %tmp20, %tmp22
+  %tmp24 = trunc i64 %indvars.iv to i32
+  %tmp25 = srem i32 %tmp23, %tmp24
+  %tmp26 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  store i32 %tmp25, i32* %tmp26, align 4  ; writes @B[i]
+  br label %bb27
+
+bb27:                                             ; preds = %bb19
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %tmp28 = add nuw nsw i32 %.0, 1
+  br label %bb17
+
+bb29:                                             ; preds = %bb18
+  ret void
+}
+
+; CHECK: void @dep_free_parametric
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label %[[LOOP1HEADER:bb[0-9]*]]
+; CHECK: [[LOOP1HEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP1BODY:bb[0-9]*]], label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br label %[[LOOP1LATCH:bb[0-9]*]]
+; CHECK: [[LOOP1LATCH]]
+; CHECK: br label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP2PREHEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP2BODY:bb[0-9]*]], label %[[LOOP2EXIT:bb[0-9]*]]
+; CHECK: [[LOOP2BODY]]
+; CHECK: br label %[[LOOP2LATCH:bb[0-9]+]]
+; CHECK: [[LOOP2LATCH]]
+; CHECK: br label %[[LOOP1HEADER]]
+; CHECK: ret void
+define void @dep_free_parametric(i32* noalias %arg, i64 %arg2) {  ; same shape as a two-loop store test, but both loops are bounded by %arg2
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb12, %bb
+  %.01 = phi i64 [ 0, %bb ], [ %tmp13, %bb12 ]
+  %tmp = icmp slt i64 %.01, %arg2  ; signed compare against the runtime bound
+  br i1 %tmp, label %bb5, label %bb15  ; first loop exits into the second loop's header
+
+bb5:                                              ; preds = %bb3
+  %tmp6 = add nsw i64 %.01, -3
+  %tmp7 = add nuw nsw i64 %.01, 3
+  %tmp8 = mul nsw i64 %tmp6, %tmp7
+  %tmp9 = srem i64 %tmp8, %.01  ; NOTE(review): divisor %.01 is 0 on the first iteration
+  %tmp10 = trunc i64 %tmp9 to i32
+  %tmp11 = getelementptr inbounds i32, i32* %arg, i64 %.01
+  store i32 %tmp10, i32* %tmp11, align 4  ; writes %arg[i]
+  br label %bb12
+
+bb12:                                             ; preds = %bb5
+  %tmp13 = add nuw nsw i64 %.01, 1
+  br label %bb3
+
+bb15:                                             ; preds = %bb25, %bb3
+  %.0 = phi i64 [ 0, %bb3 ], [ %tmp26, %bb25 ]
+  %tmp16 = icmp slt i64 %.0, %arg2  ; same bound as the first loop
+  br i1 %tmp16, label %bb18, label %bb17
+
+bb17:                                             ; preds = %bb15
+  br label %bb27
+
+bb18:                                             ; preds = %bb15
+  %tmp19 = add nsw i64 %.0, -3
+  %tmp20 = add nuw nsw i64 %.0, 3
+  %tmp21 = mul nsw i64 %tmp19, %tmp20
+  %tmp22 = srem i64 %tmp21, %.0
+  %tmp23 = trunc i64 %tmp22 to i32
+  %tmp24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %.0
+  store i32 %tmp23, i32* %tmp24, align 4  ; writes @B[i]
+  br label %bb25
+
+bb25:                                             ; preds = %bb18
+  %tmp26 = add nuw nsw i64 %.0, 1
+  br label %bb15
+
+bb27:                                             ; preds = %bb17
+  ret void
+}
+
+; CHECK: void @raw_only
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br label %[[LOOP1HEADER:bb[0-9]*]]
+; CHECK: [[LOOP1HEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP1BODY:bb[0-9]*]], label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br label %[[LOOP1LATCH:bb[0-9]*]]
+; CHECK: [[LOOP1LATCH]]
+; CHECK: br label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP2PREHEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP2BODY:bb[0-9]*]], label %[[LOOP2EXIT:bb[0-9]*]]
+; CHECK: [[LOOP2BODY]]
+; CHECK: br label %[[LOOP2LATCH:bb[0-9]+]]
+; CHECK: [[LOOP2LATCH]]
+; CHECK: br label %[[LOOP1HEADER]]
+; CHECK: ret void
+define void @raw_only(i32* noalias %arg) {  ; loop 1 writes %arg[i]; loop 2 reads the same %arg[i] and writes @B[i]
+bb:
+  br label %bb5
+
+bb5:                                              ; preds = %bb9, %bb
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %bb9 ], [ 0, %bb ]
+  %exitcond4 = icmp ne i64 %indvars.iv2, 100
+  br i1 %exitcond4, label %bb7, label %bb11  ; first loop exits into the second loop's header
+
+bb7:                                              ; preds = %bb5
+  %tmp = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv2
+  %tmp8 = trunc i64 %indvars.iv2 to i32
+  store i32 %tmp8, i32* %tmp, align 4  ; %arg[i] = i
+  br label %bb9
+
+bb9:                                              ; preds = %bb7
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  br label %bb5
+
+bb11:                                             ; preds = %bb18, %bb5
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb18 ], [ 0, %bb5 ]
+  %exitcond = icmp ne i64 %indvars.iv, 100
+  br i1 %exitcond, label %bb13, label %bb19
+
+bb13:                                             ; preds = %bb11
+  %tmp14 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv
+  %tmp15 = load i32, i32* %tmp14, align 4  ; reads the element loop 1 stored at the same index
+  %tmp16 = shl nsw i32 %tmp15, 1
+  %tmp17 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  store i32 %tmp16, i32* %tmp17, align 4  ; @B[i] = %arg[i] << 1
+  br label %bb18
+
+bb18:                                             ; preds = %bb13
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb11
+
+bb19:                                             ; preds = %bb11
+  ret void
+}
+
+; CHECK: void @raw_only_parametric
+; CHECK-NEXT: bb:
+; CHECK: br label %[[LOOP1HEADER:bb[0-9]*]]
+; CHECK: [[LOOP1HEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP1BODY:bb[0-9]*]], label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br label %[[LOOP1LATCH:bb[0-9]*]]
+; CHECK: [[LOOP1LATCH]]
+; CHECK: br label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP2PREHEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP2BODY:bb[0-9]*]], label %[[LOOP2EXIT:bb[0-9]*]]
+; CHECK: [[LOOP2BODY]]
+; CHECK: br label %[[LOOP2LATCH:bb[0-9]+]]
+; CHECK: [[LOOP2LATCH]]
+; CHECK: br label %[[LOOP1HEADER]]
+; CHECK: ret void
+define void @raw_only_parametric(i32* noalias %arg, i32 %arg4) {  ; write-then-read of %arg[i] across two loops bounded by %arg4
+bb:
+  br label %bb5
+
+bb5:                                              ; preds = %bb11, %bb
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %bb11 ], [ 0, %bb ]
+  %tmp = sext i32 %arg4 to i64  ; bound is widened inside the loop header
+  %tmp6 = icmp slt i64 %indvars.iv2, %tmp
+  br i1 %tmp6, label %bb8, label %bb14  ; first loop exits into the second loop's header
+
+bb8:                                              ; preds = %bb5
+  %tmp9 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv2
+  %tmp10 = trunc i64 %indvars.iv2 to i32
+  store i32 %tmp10, i32* %tmp9, align 4  ; %arg[i] = i
+  br label %bb11
+
+bb11:                                             ; preds = %bb8
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  br label %bb5
+
+bb14:                                             ; preds = %bb22, %bb5
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb22 ], [ 0, %bb5 ]
+  %tmp13 = sext i32 %arg4 to i64  ; same bound, recomputed for the second loop
+  %tmp15 = icmp slt i64 %indvars.iv, %tmp13
+  br i1 %tmp15, label %bb17, label %bb23
+
+bb17:                                             ; preds = %bb14
+  %tmp18 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv
+  %tmp19 = load i32, i32* %tmp18, align 4  ; reads the element loop 1 stored at the same index
+  %tmp20 = shl nsw i32 %tmp19, 1
+  %tmp21 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  store i32 %tmp20, i32* %tmp21, align 4  ; @B[i] = %arg[i] << 1
+  br label %bb22
+
+bb22:                                             ; preds = %bb17
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb14
+
+bb23:                                             ; preds = %bb14
+  ret void
+}
+
+; CHECK: void @forward_dep
+; CHECK-NEXT: bb:
+; CHECK: br label %[[LOOP1HEADER:bb[0-9]*]]
+; CHECK: [[LOOP1HEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP1BODY:bb[0-9]*]], label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br label %[[LOOP1LATCH:bb[0-9]*]]
+; CHECK: [[LOOP1LATCH]]
+; CHECK: br label %[[LOOP2PREHEADER:bb[0-9]+]]
+; CHECK: [[LOOP2PREHEADER]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP2BODY:bb[0-9]*]], label %[[LOOP2EXIT:bb[0-9]*]]
+; CHECK: [[LOOP2BODY]]
+; CHECK: br label %[[LOOP2LATCH:bb[0-9]+]]
+; CHECK: [[LOOP2LATCH]]
+; CHECK: br label %[[LOOP1HEADER]]
+; CHECK: ret void
+define void @forward_dep(i32* noalias %arg) {  ; loop 1 writes %arg[i]; loop 2 reads %arg[i-3] and writes %arg[i]
+bb:
+  br label %bb5
+
+bb5:                                              ; preds = %bb14, %bb
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %bb14 ], [ 0, %bb ]
+  %.01 = phi i32 [ 0, %bb ], [ %tmp15, %bb14 ]  ; i32 counter stepped in lockstep with %indvars.iv2
+  %exitcond4 = icmp ne i64 %indvars.iv2, 100
+  br i1 %exitcond4, label %bb7, label %bb17  ; first loop exits into the second loop's header
+
+bb7:                                              ; preds = %bb5
+  %tmp = add nsw i32 %.01, -3
+  %tmp8 = add nuw nsw i64 %indvars.iv2, 3
+  %tmp9 = trunc i64 %tmp8 to i32
+  %tmp10 = mul nsw i32 %tmp, %tmp9
+  %tmp11 = trunc i64 %indvars.iv2 to i32
+  %tmp12 = srem i32 %tmp10, %tmp11  ; NOTE(review): divisor is 0 on the first iteration
+  %tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv2
+  store i32 %tmp12, i32* %tmp13, align 4  ; writes %arg[i]
+  br label %bb14
+
+bb14:                                             ; preds = %bb7
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  %tmp15 = add nuw nsw i32 %.01, 1
+  br label %bb5
+
+bb17:                                             ; preds = %bb25, %bb5
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb25 ], [ 0, %bb5 ]
+  %exitcond = icmp ne i64 %indvars.iv, 100
+  br i1 %exitcond, label %bb19, label %bb26
+
+bb19:                                             ; preds = %bb17
+  %tmp20 = add nsw i64 %indvars.iv, -3  ; index three elements behind the current one
+  %tmp21 = getelementptr inbounds i32, i32* %arg, i64 %tmp20
+  %tmp22 = load i32, i32* %tmp21, align 4  ; reads %arg[i-3]
+  %tmp23 = mul nsw i32 %tmp22, 3
+  %tmp24 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv
+  store i32 %tmp23, i32* %tmp24, align 4  ; %arg[i] = %arg[i-3] * 3
+  br label %bb25
+
+bb25:                                             ; preds = %bb19
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %bb17
+
+bb26:                                             ; preds = %bb17
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/AMDGPU/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/AMDGPU/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/AMDGPU/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/AMDGPU/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if not 'AMDGPU' in config.root.targets:  # 'AMDGPU' is absent from the configured target list
+    config.unsupported = True  # flag the tests governed by this config as unsupported
+

Added: llvm/trunk/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/AMDGPU/popcnt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,127 @@
+; RUN: opt -loop-idiom -mtriple=amdgcn-- -S < %s | FileCheck %s
+
+; Mostly copied from x86 version.
+
+; To recognize this pattern:
+;int popcount(unsigned long long a) {
+;    int c = 0;
+;    while (a) {
+;        c++;
+;        a &= a - 1;
+;    }
+;    return c;
+;}
+;
+
+; CHECK-LABEL: @popcount_i64
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount_i64(i64 %a) nounwind uwtable readnone ssp {  ; i64 form of the bit-clearing count loop
+entry:
+  %tobool3 = icmp eq i64 %a, 0  ; a == 0: the loop body never runs
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]  ; running count c, starts at 0
+  %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]  ; current value of a
+  %inc = add nsw i32 %c.05, 1  ; c++
+  %sub = add i64 %a.addr.04, -1  ; a - 1
+  %and = and i64 %sub, %a.addr.04  ; a &= a - 1 (clears the lowest set bit)
+  %tobool = icmp eq i64 %and, 0  ; leave the loop once a becomes 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]  ; 0 if the loop was skipped, else the final count
+  ret i32 %c.0.lcssa
+}
+
+; CHECK-LABEL: @popcount_i32
+; CHECK: entry
+; CHECK: llvm.ctpop.i32
+; CHECK: ret
+define i32 @popcount_i32(i32 %a) nounwind uwtable readnone ssp {  ; i32 form of the bit-clearing count loop
+entry:
+  %tobool3 = icmp eq i32 %a, 0  ; a == 0: the loop body never runs
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]  ; running count c, starts at 0
+  %a.addr.04 = phi i32 [ %and, %while.body ], [ %a, %entry ]  ; current value of a
+  %inc = add nsw i32 %c.05, 1  ; c++
+  %sub = add i32 %a.addr.04, -1  ; a - 1
+  %and = and i32 %sub, %a.addr.04  ; a &= a - 1 (clears the lowest set bit)
+  %tobool = icmp eq i32 %and, 0  ; leave the loop once a becomes 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]  ; 0 if the loop was skipped, else the final count
+  ret i32 %c.0.lcssa
+}
+
+; CHECK-LABEL: @popcount_i128
+; CHECK: entry
+; CHECK: llvm.ctpop.i128
+; CHECK: ret
+define i32 @popcount_i128(i128 %a) nounwind uwtable readnone ssp {  ; i128 form of the bit-clearing count loop
+entry:
+  %tobool3 = icmp eq i128 %a, 0  ; a == 0: the loop body never runs
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]  ; running count c, starts at 0
+  %a.addr.04 = phi i128 [ %and, %while.body ], [ %a, %entry ]  ; current value of a
+  %inc = add nsw i32 %c.05, 1  ; c++
+  %sub = add i128 %a.addr.04, -1  ; a - 1
+  %and = and i128 %sub, %a.addr.04  ; a &= a - 1 (clears the lowest set bit)
+  %tobool = icmp eq i128 %and, 0  ; leave the loop once a becomes 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]  ; 0 if the loop was skipped, else the final count
+  ret i32 %c.0.lcssa
+}
+
+; To recognize this pattern:
+;int popcount(unsigned long long a, int mydata1, int mydata2) {
+;    int c = 0;
+;    while (a) {
+;        c++;
+;        a &= a - 1;
+;        mydata1 *= c;
+;        mydata2 *= (int)a;
+;    }
+;    return c + mydata1 + mydata2;
+;}
+
+; CHECK-LABEL: @popcount2
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount2(i64 %a, i32 %mydata1, i32 %mydata2) nounwind uwtable readnone ssp {  ; count loop that also updates two extra accumulators
+entry:
+  %tobool9 = icmp eq i64 %a, 0  ; a == 0: the loop body never runs
+  br i1 %tobool9, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.013 = phi i32 [ %inc, %while.body ], [ 0, %entry ]  ; running count c, starts at 0
+  %mydata2.addr.012 = phi i32 [ %mul1, %while.body ], [ %mydata2, %entry ]  ; current mydata2
+  %mydata1.addr.011 = phi i32 [ %mul, %while.body ], [ %mydata1, %entry ]  ; current mydata1
+  %a.addr.010 = phi i64 [ %and, %while.body ], [ %a, %entry ]  ; current value of a
+  %inc = add nsw i32 %c.013, 1  ; c++
+  %sub = add i64 %a.addr.010, -1  ; a - 1
+  %and = and i64 %sub, %a.addr.010  ; a &= a - 1
+  %mul = mul nsw i32 %inc, %mydata1.addr.011  ; mydata1 *= c (uses the already incremented c)
+  %conv = trunc i64 %and to i32  ; (int)a, taken after the and
+  %mul1 = mul nsw i32 %conv, %mydata2.addr.012  ; mydata2 *= (int)a
+  %tobool = icmp eq i64 %and, 0  ; leave the loop once a becomes 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]  ; final c (0 if the loop was skipped)
+  %mydata2.addr.0.lcssa = phi i32 [ %mydata2, %entry ], [ %mul1, %while.body ]  ; final mydata2
+  %mydata1.addr.0.lcssa = phi i32 [ %mydata1, %entry ], [ %mul, %while.body ]  ; final mydata1
+  %add = add i32 %mydata2.addr.0.lcssa, %mydata1.addr.0.lcssa  ; mydata2 + mydata1
+  %add2 = add i32 %add, %c.0.lcssa  ; ... + c
+  ret i32 %add2
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/ARM/ctlz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/ARM/ctlz.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/ARM/ctlz.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/ARM/ctlz.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,246 @@
+; RUN: opt -loop-idiom -mtriple=armv7a < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s
+; RUN: opt -loop-idiom -mtriple=armv4t < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s
+
+; Recognize CTLZ builtin pattern.
+; Here we'll just convert the loop to a countable one,
+; so do not insert the builtin if the CPU does not support CTLZ.
+;
+; int ctlz_and_other(int n, char *a)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0, n0 = n;
+;   while(n >>= 1) {
+;     a[i] = (n0 & (1 << i)) ? 1 : 0;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; LZCNT:  entry
+; LZCNT:  %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true)
+; LZCNT-NEXT:  %1 = sub i32 32, %0
+; LZCNT-NEXT:  %2 = zext i32 %1 to i64
+; LZCNT:  %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ]
+; LZCNT:  %4 = trunc i64 %indvars.iv.next.lcssa to i32
+; LZCNT:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ]
+; LZCNT:  ret i32 %i.0.lcssa
+
+; NOLZCNT:  entry
+; NOLZCNT-NOT:  @llvm.ctlz
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) {  ; shift-count loop that also stores one byte per iteration
+entry:
+  %c = icmp sgt i32 %n, 0  ; n > 0 ?
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if n > 0, otherwise -n
+  %shr8 = lshr i32 %abs_n, 1  ; first n >>= 1, done ahead of the loop
+  %tobool9 = icmp eq i32 %shr8, 0  ; already 0: the loop body never runs
+  br i1 %tobool9, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]  ; i as a 64-bit induction variable
+  %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]  ; current shifted n
+  %0 = trunc i64 %indvars.iv to i32  ; i narrowed to 32 bits
+  %shl = shl i32 1, %0  ; 1 << i
+  %and = and i32 %shl, %abs_n  ; n0 & (1 << i)
+  %tobool1 = icmp ne i32 %and, 0  ; is bit i of n0 set?
+  %conv = zext i1 %tobool1 to i8  ; 1 or 0 as a byte
+  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv  ; &a[i]
+  store i8 %conv, i8* %arrayidx, align 1  ; a[i] = bit value
+  %indvars.iv.next = add nuw i64 %indvars.iv, 1  ; i++
+  %shr = ashr i32 %shr11, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once n becomes 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %1 = trunc i64 %indvars.iv.next to i32  ; final i narrowed to the i32 return type
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]  ; 0 if the loop was skipped, else the final i
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_zero_check(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n) {
+;     n >>= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = call i32 @llvm.ctlz.i32(i32 %abs_n, i1 true)
+; ALL-NEXT:  %1 = sub i32 32, %0
+; ALL:  %inc.lcssa = phi i32 [ %1, %while.body ]
+; ALL:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check(i32 %n) {  ; shift-count loop guarded by an explicit zero test
+entry:
+  %c = icmp sgt i32 %n, 0  ; n > 0 ?
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if n > 0, otherwise -n
+  %tobool4 = icmp eq i32 %abs_n, 0  ; value is 0: the loop body never runs
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]  ; counter i, starts at 0
+  %n.addr.05 = phi i32 [ %shr, %while.body ], [ %abs_n, %while.body.preheader ]  ; current n
+  %shr = ashr i32 %n.addr.05, 1  ; n >>= 1
+  %inc = add nsw i32 %i.06, 1  ; i++
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once n becomes 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]  ; 0 if the loop was skipped, else the final i
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %abs_n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL:  %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz(i32 %n) {  ; single-block loop: shift, test, count
+entry:
+  %c = icmp sgt i32 %n, 0  ; n > 0 ?
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if n > 0, otherwise -n
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at 0
+  %shr = ashr i32 %n.addr.0, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_add(int n, int i0)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = i0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %abs_n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL-NEXT:  %4 = add i32 %2, %i0
+; ALL:  %i.0.lcssa = phi i32 [ %4, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add(i32 %n, i32 %i0) {  ; same loop as @ctlz, but the counter starts at %i0
+entry:
+  %c = icmp sgt i32 %n, 0  ; n > 0 ?
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if n > 0, otherwise -n
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at i0
+  %shr = ashr i32 %n.addr.0, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_sext(short in)
+; {
+;   int n = in;
+;   if (in < 0)
+;     n = -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %abs_n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL:  %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_sext(i16 %in) {  ; same loop as @ctlz, fed from a sign-extended i16
+entry:
+  %n = sext i16 %in to i32  ; int n = in
+  %c = icmp sgt i16 %in, 0  ; in > 0 ? (compared at i16 width)
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if in > 0, otherwise -n
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at 0
+  %shr = ashr i32 %n.addr.0, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/X86/ctlz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/X86/ctlz.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/X86/ctlz.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/X86/ctlz.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,528 @@
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck -check-prefix=LZCNT --check-prefix=ALL %s
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck -check-prefix=NOLZCNT --check-prefix=ALL %s
+
+; Recognize CTLZ builtin pattern.
+; Here we'll just convert the loop to a countable one,
+; so do not insert the builtin if the CPU does not support CTLZ.
+;
+; int ctlz_and_other(int n, char *a)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0, n0 = n;
+;   while(n >>= 1) {
+;     a[i] = (n0 & (1 << i)) ? 1 : 0;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; LZCNT:  entry
+; LZCNT:  %0 = call i32 @llvm.ctlz.i32(i32 %shr8, i1 true)
+; LZCNT-NEXT:  %1 = sub i32 32, %0
+; LZCNT-NEXT:  %2 = zext i32 %1 to i64
+; LZCNT:  %indvars.iv.next.lcssa = phi i64 [ %2, %while.body ]
+; LZCNT:  %4 = trunc i64 %indvars.iv.next.lcssa to i32
+; LZCNT:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %4, %while.end.loopexit ]
+; LZCNT:  ret i32 %i.0.lcssa
+
+; NOLZCNT:  entry
+; NOLZCNT-NOT:  @llvm.ctlz
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @ctlz_and_other(i32 %n, i8* nocapture %a) {  ; shift-count loop that also stores one byte per iteration
+entry:
+  %c = icmp sgt i32 %n, 0  ; n > 0 ?
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if n > 0, otherwise -n
+  %shr8 = lshr i32 %abs_n, 1  ; first n >>= 1, done ahead of the loop
+  %tobool9 = icmp eq i32 %shr8, 0  ; already 0: the loop body never runs
+  br i1 %tobool9, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]  ; i as a 64-bit induction variable
+  %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]  ; current shifted n
+  %0 = trunc i64 %indvars.iv to i32  ; i narrowed to 32 bits
+  %shl = shl i32 1, %0  ; 1 << i
+  %and = and i32 %shl, %abs_n  ; n0 & (1 << i)
+  %tobool1 = icmp ne i32 %and, 0  ; is bit i of n0 set?
+  %conv = zext i1 %tobool1 to i8  ; 1 or 0 as a byte
+  %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv  ; &a[i]
+  store i8 %conv, i8* %arrayidx, align 1  ; a[i] = bit value
+  %indvars.iv.next = add nuw i64 %indvars.iv, 1  ; i++
+  %shr = ashr i32 %shr11, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once n becomes 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %1 = trunc i64 %indvars.iv.next to i32  ; final i narrowed to the i32 return type
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]  ; 0 if the loop was skipped, else the final i
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_zero_check(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n) {
+;     n >>= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = call i32 @llvm.ctlz.i32(i32 %abs_n, i1 true)
+; ALL-NEXT:  %1 = sub i32 32, %0
+; ALL:  %inc.lcssa = phi i32 [ %1, %while.body ]
+; ALL:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check(i32 %n) {  ; shift-count loop guarded by an explicit zero test
+entry:
+  %c = icmp sgt i32 %n, 0  ; n > 0 ?
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if n > 0, otherwise -n
+  %tobool4 = icmp eq i32 %abs_n, 0  ; value is 0: the loop body never runs
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]  ; counter i, starts at 0
+  %n.addr.05 = phi i32 [ %shr, %while.body ], [ %abs_n, %while.body.preheader ]  ; current n
+  %shr = ashr i32 %n.addr.05, 1  ; n >>= 1
+  %inc = add nsw i32 %i.06, 1  ; i++
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once n becomes 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]  ; 0 if the loop was skipped, else the final i
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_zero_check_lshr(int n)
+; {
+;   int i = 0;
+;   while(n) {
+;     n >>= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
+; ALL-NEXT:  %1 = sub i32 32, %0
+; ALL:  %inc.lcssa = phi i32 [ %1, %while.body ]
+; ALL:  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check_lshr(i32 %n) {  ; zero-guarded count loop using a logical shift on the raw argument
+entry:
+  %tobool4 = icmp eq i32 %n, 0  ; n == 0: the loop body never runs
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]  ; counter i, starts at 0
+  %n.addr.05 = phi i32 [ %shr, %while.body ], [ %n, %while.body.preheader ]  ; current n
+  %shr = lshr i32 %n.addr.05, 1  ; n >>= 1 (logical shift)
+  %inc = add nsw i32 %i.06, 1  ; i++
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once n becomes 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]  ; 0 if the loop was skipped, else the final i
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %abs_n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL:  %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz(i32 %n) {  ; single-block loop: shift, test, count
+entry:
+  %c = icmp sgt i32 %n, 0  ; n > 0 ?
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if n > 0, otherwise -n
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at 0
+  %shr = ashr i32 %n.addr.0, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_lshr(int n)
+; {
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = lshr i32 %n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL:  %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_lshr(i32 %n) {  ; single-block count loop using a logical shift on the raw argument
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at 0
+  %shr = lshr i32 %n.addr.0, 1  ; n >>= 1 (logical shift)
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_add(int n, int i0)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = i0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %abs_n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL-NEXT:  %4 = add i32 %2, %i0
+; ALL:  %i.0.lcssa = phi i32 [ %4, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add(i32 %n, i32 %i0) {  ; same loop as @ctlz, but the counter starts at %i0
+entry:
+  %c = icmp sgt i32 %n, 0  ; n > 0 ?
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if n > 0, otherwise -n
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at i0
+  %shr = ashr i32 %n.addr.0, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_add_lshr(int n, int i0)
+; {
+;   int i = i0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = lshr i32 %n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL-NEXT:  %4 = add i32 %2, %i0
+; ALL:  %i.0.lcssa = phi i32 [ %4, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add_lshr(i32 %n, i32 %i0) {  ; logical-shift count loop whose counter starts at %i0
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at i0
+  %shr = lshr i32 %n.addr.0, 1  ; n >>= 1 (logical shift)
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_sext(short in)
+; {
+;   int n = in;
+;   if (in < 0)
+;     n = -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = ashr i32 %abs_n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL:  %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_sext(i16 %in) {  ; same loop as @ctlz, fed from a sign-extended i16
+entry:
+  %n = sext i16 %in to i32  ; int n = in
+  %c = icmp sgt i16 %in, 0  ; in > 0 ? (compared at i16 width)
+  %negn = sub nsw i32 0, %n  ; -n
+  %abs_n = select i1 %c, i32 %n, i32 %negn  ; n if in > 0, otherwise -n
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at 0
+  %shr = ashr i32 %n.addr.0, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_sext_lshr(short in)
+; {
+;   int i = 0;
+;   while(in >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL:  entry
+; ALL:  %0 = lshr i32 %n, 1
+; ALL-NEXT:  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
+; ALL-NEXT:  %2 = sub i32 32, %1
+; ALL-NEXT:  %3 = add i32 %2, 1
+; ALL:  %i.0.lcssa = phi i32 [ %2, %while.cond ]
+; ALL:  ret i32 %i.0.lcssa
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_sext_lshr(i16 %in) {  ; logical-shift count loop fed from a sign-extended i16
+entry:
+  %n = sext i16 %in to i32  ; widen the i16 argument to i32 with sign extension
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at 0
+  %shr = lshr i32 %n.addr.0, 1  ; n >>= 1 (logical shift on the 32-bit value)
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++ (only consumed by the back edge)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the count from before this iteration's increment
+}
+
+; This loop contains a volatile store. If x is initially negative, the loop
+; never terminates: the ashr eventually produces all ones and keeps doing so,
+; so the exit condition (shr == 0) is never met. If we converted this to a
+; countable loop using ctlz, that loop would run only 32 times, which differs
+; from the infinite number of iterations (and volatile stores) of the original.
+define i32 @foo(i32 %x) {  ; ashr count loop with a volatile store; both check blocks below expect it unchanged
+; LZCNT-LABEL: @foo(
+; LZCNT-NEXT:  entry:
+; LZCNT-NEXT:    [[V:%.*]] = alloca i8, align 1
+; LZCNT-NEXT:    [[TOBOOL4:%.*]] = icmp eq i32 [[X:%.*]], 0
+; LZCNT-NEXT:    br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_LR_PH:%.*]]
+; LZCNT:       while.body.lr.ph:
+; LZCNT-NEXT:    br label [[WHILE_BODY:%.*]]
+; LZCNT:       while.body:
+; LZCNT-NEXT:    [[CNT_06:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[WHILE_BODY]] ]
+; LZCNT-NEXT:    [[X_ADDR_05:%.*]] = phi i32 [ [[X]], [[WHILE_BODY_LR_PH]] ], [ [[SHR:%.*]], [[WHILE_BODY]] ]
+; LZCNT-NEXT:    [[SHR]] = ashr i32 [[X_ADDR_05]], 1
+; LZCNT-NEXT:    [[INC]] = add i32 [[CNT_06]], 1
+; LZCNT-NEXT:    store volatile i8 42, i8* [[V]], align 1
+; LZCNT-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[SHR]], 0
+; LZCNT-NEXT:    br i1 [[TOBOOL]], label [[WHILE_COND_WHILE_END_CRIT_EDGE:%.*]], label [[WHILE_BODY]]
+; LZCNT:       while.cond.while.end_crit_edge:
+; LZCNT-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; LZCNT-NEXT:    br label [[WHILE_END]]
+; LZCNT:       while.end:
+; LZCNT-NEXT:    [[CNT_0_LCSSA:%.*]] = phi i32 [ [[SPLIT]], [[WHILE_COND_WHILE_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; LZCNT-NEXT:    ret i32 [[CNT_0_LCSSA]]
+;
+; NOLZCNT-LABEL: @foo(
+; NOLZCNT-NEXT:  entry:
+; NOLZCNT-NEXT:    [[V:%.*]] = alloca i8, align 1
+; NOLZCNT-NEXT:    [[TOBOOL4:%.*]] = icmp eq i32 [[X:%.*]], 0
+; NOLZCNT-NEXT:    br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_LR_PH:%.*]]
+; NOLZCNT:       while.body.lr.ph:
+; NOLZCNT-NEXT:    br label [[WHILE_BODY:%.*]]
+; NOLZCNT:       while.body:
+; NOLZCNT-NEXT:    [[CNT_06:%.*]] = phi i32 [ 0, [[WHILE_BODY_LR_PH]] ], [ [[INC:%.*]], [[WHILE_BODY]] ]
+; NOLZCNT-NEXT:    [[X_ADDR_05:%.*]] = phi i32 [ [[X]], [[WHILE_BODY_LR_PH]] ], [ [[SHR:%.*]], [[WHILE_BODY]] ]
+; NOLZCNT-NEXT:    [[SHR]] = ashr i32 [[X_ADDR_05]], 1
+; NOLZCNT-NEXT:    [[INC]] = add i32 [[CNT_06]], 1
+; NOLZCNT-NEXT:    store volatile i8 42, i8* [[V]], align 1
+; NOLZCNT-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[SHR]], 0
+; NOLZCNT-NEXT:    br i1 [[TOBOOL]], label [[WHILE_COND_WHILE_END_CRIT_EDGE:%.*]], label [[WHILE_BODY]]
+; NOLZCNT:       while.cond.while.end_crit_edge:
+; NOLZCNT-NEXT:    [[SPLIT:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; NOLZCNT-NEXT:    br label [[WHILE_END]]
+; NOLZCNT:       while.end:
+; NOLZCNT-NEXT:    [[CNT_0_LCSSA:%.*]] = phi i32 [ [[SPLIT]], [[WHILE_COND_WHILE_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; NOLZCNT-NEXT:    ret i32 [[CNT_0_LCSSA]]
+;
+entry:
+  %v = alloca i8, align 1  ; one-byte stack slot, target of the volatile store
+  %tobool4 = icmp eq i32 %x, 0  ; x == 0: the loop body never runs
+  br i1 %tobool4, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.body
+  %cnt.06 = phi i32 [ 0, %while.body.lr.ph ], [ %inc, %while.body ]  ; iteration counter, starts at 0
+  %x.addr.05 = phi i32 [ %x, %while.body.lr.ph ], [ %shr, %while.body ]  ; current x
+  %shr = ashr i32 %x.addr.05, 1  ; x >>= 1 (arithmetic: the sign bit is replicated)
+  %inc = add i32 %cnt.06, 1  ; cnt++
+  store volatile i8 42, i8* %v, align 1  ; volatile store executed on every iteration
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once x becomes 0
+  br i1 %tobool, label %while.cond.while.end_crit_edge, label %while.body
+
+while.cond.while.end_crit_edge:                   ; preds = %while.body
+  %split = phi i32 [ %inc, %while.body ]  ; single-input phi carrying the final count out of the loop
+  br label %while.end
+
+while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
+  %cnt.0.lcssa = phi i32 [ %split, %while.cond.while.end_crit_edge ], [ 0, %entry ]  ; 0 if the loop was skipped, else the final count
+  ret i32 %cnt.0.lcssa
+}
+
+; We can't easily transform this loop. It returns 1 for an input of both
+; 0 and 1.
+;
+; int ctlz_bad(unsigned n)
+; {
+;   int i = 0;
+;   do {
+;     i++;
+;     n >>= 1;
+;   } while(n != 0);
+;   return i;
+; }
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_bad(i32 %n) {  ; do-while form; the check lines below expect the loop to be left as written
+; ALL-LABEL: @ctlz_bad(
+; ALL-NEXT:  entry:
+; ALL-NEXT:    br label [[WHILE_COND:%.*]]
+; ALL:       while.cond:
+; ALL-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[N:%.*]], [[ENTRY:%.*]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; ALL-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; ALL-NEXT:    [[SHR]] = lshr i32 [[N_ADDR_0]], 1
+; ALL-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[SHR]], 0
+; ALL-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
+; ALL-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; ALL:       while.end:
+; ALL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_COND]] ]
+; ALL-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %while.cond ]  ; current n
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]  ; counter i, starts at 0
+  %shr = lshr i32 %n.addr.0, 1  ; n >>= 1 (logical shift)
+  %tobool = icmp eq i32 %shr, 0  ; leave the loop once the shifted n is 0
+  %inc = add nsw i32 %i.0, 1  ; i++
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %inc  ; returns the incremented count, so an exit on the first iteration yields 1
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/X86/cttz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/X86/cttz.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/X86/cttz.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/X86/cttz.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,82 @@
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=core-avx2 < %s -S | FileCheck --check-prefix=ALL %s
+; RUN: opt -loop-idiom -mtriple=x86_64 -mcpu=corei7 < %s -S | FileCheck --check-prefix=ALL %s
+
+; Recognize CTTZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int cttz_zero_check(int n)
+; {
+;   int i = 0;
+;   while(n) {
+;     n <<= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL-LABEL: @cttz_zero_check
+; ALL:       %0 = call i32 @llvm.cttz.i32(i32 %n, i1 true)
+; ALL-NEXT:  %1 = sub i32 32, %0
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @cttz_zero_check(i32 %n) {
+entry:
+  %tobool4 = icmp eq i32 %n, 0                    ; guard: skip the loop entirely when n == 0
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]       ; current i
+  %n.addr.05 = phi i32 [ %shl, %while.body ], [ %n, %while.body.preheader ] ; current n
+  %shl = shl i32 %n.addr.05, 1                    ; n <<= 1
+  %inc = add nsw i32 %i.06, 1                     ; i++
+  %tobool = icmp eq i32 %shl, 0                   ; exit once the shifted value is 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]   ; 0 if the loop was skipped, else the final count
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTTZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int cttz(int n)
+; {
+;   int i = 0;
+;   while(n <<= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+; ALL-LABEL: @cttz
+; ALL:      %0 = shl i32 %n, 1
+; ALL-NEXT: %1 = call i32 @llvm.cttz.i32(i32 %0, i1 false)
+; ALL-NEXT: %2 = sub i32 32, %1
+; ALL-NEXT: %3 = add i32 %2, 1
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @cttz(i32 %n) {
+entry:
+  br label %while.cond                            ; no zero guard before the loop
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shl, %while.cond ]   ; current n
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]         ; current i
+  %shl = shl i32 %n.addr.0, 1                     ; n <<= 1 (part of the loop condition)
+  %tobool = icmp eq i32 %shl, 0                   ; exit once the shifted value is 0
+  %inc = add nsw i32 %i.0, 1                      ; i++ (only observed if the loop continues)
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0                                    ; returns the counter as it was before this iteration's increment
+}
+

Added: llvm/trunk/test/Transforms/LoopIdiom/X86/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/X86/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/X86/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/X86/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/LoopIdiom/X86/popcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/X86/popcnt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/X86/popcnt.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/X86/popcnt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,182 @@
+; RUN: opt -loop-idiom < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -S | FileCheck %s
+
+; To recognize this pattern:
+;int popcount(unsigned long long a) {
+;    int c = 0;
+;    while (a) {
+;        c++;
+;        a &= a - 1;
+;    }
+;    return c;
+;}
+; 
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount(i64 %a) nounwind uwtable readnone ssp {
+entry:
+  %tobool3 = icmp eq i64 %a, 0                    ; guard: skip the loop when a == 0
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]        ; current c
+  %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]  ; current a
+  %inc = add nsw i32 %c.05, 1                     ; c++
+  %sub = add i64 %a.addr.04, -1                   ; a - 1
+  %and = and i64 %sub, %a.addr.04                 ; a &= a - 1
+  %tobool = icmp eq i64 %and, 0                   ; exit once a == 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]   ; 0 if the loop was skipped, else the final count
+  ret i32 %c.0.lcssa
+}
+
+; To recognize this pattern:
+;int popcount(unsigned long long a, int mydata1, int mydata2) {
+;    int c = 0;
+;    while (a) {
+;        c++;
+;        a &= a - 1;
+;        mydata1 *= c;
+;        mydata2 *= (int)a;
+;    }
+;    return c + mydata1 + mydata2;
+;}
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount2(i64 %a, i32 %mydata1, i32 %mydata2) nounwind uwtable readnone ssp {
+entry:
+  %tobool9 = icmp eq i64 %a, 0                    ; guard: skip the loop when a == 0
+  br i1 %tobool9, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.013 = phi i32 [ %inc, %while.body ], [ 0, %entry ]                    ; current c
+  %mydata2.addr.012 = phi i32 [ %mul1, %while.body ], [ %mydata2, %entry ] ; current mydata2
+  %mydata1.addr.011 = phi i32 [ %mul, %while.body ], [ %mydata1, %entry ]  ; current mydata1
+  %a.addr.010 = phi i64 [ %and, %while.body ], [ %a, %entry ]              ; current a
+  %inc = add nsw i32 %c.013, 1                    ; c++
+  %sub = add i64 %a.addr.010, -1                  ; a - 1
+  %and = and i64 %sub, %a.addr.010                ; a &= a - 1
+  %mul = mul nsw i32 %inc, %mydata1.addr.011      ; mydata1 *= c (uses the incremented counter)
+  %conv = trunc i64 %and to i32                   ; (int)a
+  %mul1 = mul nsw i32 %conv, %mydata2.addr.012    ; mydata2 *= (int)a
+  %tobool = icmp eq i64 %and, 0                   ; exit once a == 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  %mydata2.addr.0.lcssa = phi i32 [ %mydata2, %entry ], [ %mul1, %while.body ]
+  %mydata1.addr.0.lcssa = phi i32 [ %mydata1, %entry ], [ %mul, %while.body ]
+  %add = add i32 %mydata2.addr.0.lcssa, %mydata1.addr.0.lcssa
+  %add2 = add i32 %add, %c.0.lcssa                ; c + mydata1 + mydata2
+  ret i32 %add2
+}
+
+; Some variants that once caused crashes
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @PopCntCrash1(i64 %a) nounwind uwtable readnone ssp { ; popcount-shaped loop whose counter update is not a plain +1
+entry:
+  %tobool3 = icmp eq i64 %a, 0                    ; guard: skip the loop when a == 0
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+  %t = add i32 %c.05, %c.05                       ; 2 * c
+  %inc = add nsw i32 %t, 1                        ; counter becomes 2 * c + 1 instead of c + 1
+  %sub = add i64 %a.addr.04, -1                   ; a - 1
+  %and = and i64 %sub, %a.addr.04                 ; a &= a - 1
+  %tobool = icmp eq i64 %and, 0                   ; exit once a == 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  ret i32 %c.0.lcssa
+
+; CHECK: entry
+; CHECK: ret 
+}
+
+define i32 @PopCntCrash2(i64 %a, i32 %b) nounwind uwtable readnone ssp { ; popcount-shaped loop whose counter starts at %b
+entry:
+  %tobool3 = icmp eq i64 %a, 0                    ; guard: skip the loop when a == 0
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ %b, %entry ]       ; counter is seeded with %b, not 0
+  %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+  %inc = add nsw i32 %c.05, 1                     ; c++
+  %sub = add i64 %a.addr.04, -1                   ; a - 1
+  %and = and i64 %sub, %a.addr.04                 ; a &= a - 1
+  %tobool = icmp eq i64 %and, 0                   ; exit once a == 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]   ; 0 on the skipped-loop path even though the counter starts at %b
+  ret i32 %c.0.lcssa
+}
+
+define i32 @PopCntCrash3(i64 %a, i32 %x) { ; popcount-shaped body, but the loop exit tests %x rather than the running value
+entry:
+  %tobool3 = icmp eq i64 %a, 0                    ; guard: skip the loop when a == 0
+  %cmp = icmp eq i32 %x, 0                        ; computed once, outside the loop
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+  %inc = add nsw i32 %c.05, 1                     ; c++
+  %sub = add i64 %a.addr.04, -1                   ; a - 1
+  %and = and i64 %sub, %a.addr.04                 ; a &= a - 1
+  %tobool = icmp eq i64 %and, 0                   ; computed but not used by the branch below
+  br i1 %cmp, label %while.end, label %while.body ; exit is controlled by %cmp, not %tobool
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  ret i32 %c.0.lcssa
+}
+
+; The a & (a - 1) in the loop is a & (b - 1) in this code. Make sure we don't
+; convert it.
+define i32 @popcount_bad(i64 %a, i64 %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: @popcount_bad(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL3:%.*]] = icmp eq i64 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL3]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[C_05:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[A_ADDR_04:%.*]] = phi i64 [ [[AND:%.*]], [[WHILE_BODY]] ], [ [[A]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[C_05]], 1
+; CHECK-NEXT:    [[SUB:%.*]] = add i64 [[B:%.*]], -1
+; CHECK-NEXT:    [[AND]] = and i64 [[SUB]], [[A_ADDR_04]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i64 [[AND]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[C_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[C_0_LCSSA]]
+;
+entry:
+  %tobool3 = icmp eq i64 %a, 0                    ; guard: skip the loop when a == 0
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+  %inc = add nsw i32 %c.05, 1                     ; c++
+  %sub = add i64 %b, -1                           ; b - 1: uses %b, not the running value of a
+  %and = and i64 %sub, %a.addr.04                 ; a &= b - 1
+  %tobool = icmp eq i64 %and, 0                   ; exit once a == 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  ret i32 %c.0.lcssa
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,456 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; memcpy.atomic formation (atomic load & store)
+define void @test1(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test1(
+; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 unordered, align 1    ; unordered atomic load of Base[indvar]
+  store atomic i8 %V, i8* %DestI unordered, align 1       ; unordered atomic store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation (atomic store, normal load)
+define void @test2(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2(
+; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load i8, i8* %I.0.014, align 1                     ; plain (non-atomic) load with explicit align 1
+  store atomic i8 %V, i8* %DestI unordered, align 1       ; unordered atomic store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic store, normal load w/ no align)
+define void @test2b(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2b(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load i8, i8* %I.0.014                              ; plain load with no explicit alignment
+  store atomic i8 %V, i8* %DestI unordered, align 1       ; unordered atomic store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic store, normal load w/ bad align)
+define void @test2c(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2c(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load i32, i32* %I.0.014, align 2                   ; plain i32 load whose align (2) is below the 4-byte element size
+  store atomic i32 %V, i32* %DestI unordered, align 4     ; unordered atomic store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic store w/ bad align, normal load)
+define void @test2d(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2d(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load i32, i32* %I.0.014, align 4                   ; plain i32 load with align 4
+  store atomic i32 %V, i32* %DestI unordered, align 2     ; atomic i32 store whose align (2) is below the 4-byte element size
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; memcpy.atomic formation (normal store, atomic load)
+define void @test3(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test3(
+; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i32 1)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 unordered, align 1    ; unordered atomic load of Base[indvar]
+  store i8 %V, i8* %DestI, align 1                        ; plain (non-atomic) store with explicit align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation rejection (normal store w/ no align, atomic load)
+define void @test3b(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test3b(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 unordered, align 1    ; unordered atomic load of Base[indvar]
+  store i8 %V, i8* %DestI                                 ; plain store with no explicit alignment
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation rejection (normal store, atomic load w/ bad align)
+define void @test3c(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test3c(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load atomic i32, i32* %I.0.014 unordered, align 2  ; atomic i32 load whose align (2) is below the 4-byte element size
+  store i32 %V, i32* %DestI, align 4                      ; plain i32 store with align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation rejection (normal store w/ bad align, atomic load)
+define void @test3d(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test3d(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load atomic i32, i32* %I.0.014 unordered, align 4  ; unordered atomic i32 load with align 4
+  store i32 %V, i32* %DestI, align 2                      ; plain i32 store whose align (2) is below the 4-byte element size
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+;; memcpy.atomic formation rejection (atomic load, ordered-atomic store)
+define void @test4(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test4(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 unordered, align 1    ; unordered atomic load of Base[indvar]
+  store atomic i8 %V, i8* %DestI monotonic, align 1       ; store ordering is monotonic, not unordered
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation rejection (ordered-atomic load, unordered-atomic store)
+define void @test5(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test5(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load atomic i8, i8* %I.0.014 monotonic, align 1    ; load ordering is monotonic, not unordered
+  store atomic i8 %V, i8* %DestI unordered, align 1       ; unordered atomic store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation (atomic load & store) -- element size 2
+define void @test6(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test6(
+; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 1
+; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 2 %Dest{{[0-9]*}}, i8* align 2 %Base{{[0-9]*}}, i64 [[Sz]], i32 2)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i16, i32 10000
+  %Dest = alloca i16, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i16, i16* %Base, i64 %indvar
+  %DestI = getelementptr i16, i16* %Dest, i64 %indvar
+  %V = load atomic i16, i16* %I.0.014 unordered, align 2  ; unordered atomic i16 load of Base[indvar]
+  store atomic i16 %V, i16* %DestI unordered, align 2     ; unordered atomic i16 store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation (atomic load & store) -- element size 4
+define void @test7(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test7(
+; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 2
+; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %Dest{{[0-9]*}}, i8* align 4 %Base{{[0-9]*}}, i64 [[Sz]], i32 4)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i32, i32 10000
+  %Dest = alloca i32, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load atomic i32, i32* %I.0.014 unordered, align 4  ; unordered atomic i32 load of Base[indvar]
+  store atomic i32 %V, i32* %DestI unordered, align 4     ; unordered atomic i32 store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation (atomic load & store) -- element size 8
+define void @test8(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test8(
+; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 3
+; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %Dest{{[0-9]*}}, i8* align 8 %Base{{[0-9]*}}, i64 [[Sz]], i32 8)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i64, i32 10000
+  %Dest = alloca i64, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i64, i64* %Base, i64 %indvar
+  %DestI = getelementptr i64, i64* %Dest, i64 %indvar
+  %V = load atomic i64, i64* %I.0.014 unordered, align 8  ; unordered atomic i64 load of Base[indvar]
+  store atomic i64 %V, i64* %DestI unordered, align 8     ; unordered atomic i64 store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation (atomic load & store) -- element size 16
+define void @test9(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test9(
+; CHECK: [[Sz:%[0-9]+]] = shl i64 %Size, 4
+; CHECK: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 %Dest{{[0-9]*}}, i8* align 16 %Base{{[0-9]*}}, i64 [[Sz]], i32 16)
+; CHECK-NOT: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i128, i32 10000
+  %Dest = alloca i128, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i128, i128* %Base, i64 %indvar
+  %DestI = getelementptr i128, i128* %Dest, i64 %indvar
+  %V = load atomic i128, i128* %I.0.014 unordered, align 16   ; unordered atomic i128 load of Base[indvar]
+  store atomic i128 %V, i128* %DestI unordered, align 16      ; unordered atomic i128 store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+;; memcpy.atomic formation rejection (atomic load & store) -- element size 32
+define void @test10(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test10(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i256, i32 10000
+  %Dest = alloca i256, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i256, i256* %Base, i64 %indvar
+  %DestI = getelementptr i256, i256* %Dest, i64 %indvar
+  %V = load atomic i256, i256* %I.0.014 unordered, align 32   ; unordered atomic i256 (32-byte) load of Base[indvar]
+  store atomic i256 %V, i256* %DestI unordered, align 32      ; unordered atomic i256 (32-byte) store to Dest[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+
+
+; Make sure that atomic memset doesn't get recognized by mistake
+define void @test_nomemset(i8* %Base, i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test_nomemset(
+; CHECK-NOT: call void @llvm.memset
+; CHECK: store
+; CHECK: ret void
+bb.nph:                                           ; entry block (no predecessors)
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  store atomic i8 0, i8* %I.0.014 unordered, align 1      ; unordered atomic store of constant 0 to Base[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Verify that unordered memset_pattern isn't recognized.
+; This is a replica of test11_pattern from basic.ll
+define void @test_nomemset_pattern(i32* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test_nomemset_pattern(
+; CHECK-NEXT: entry:
+; CHECK-NOT: bitcast
+; CHECK-NOT: memset_pattern
+; CHECK: store atomic
+; CHECK: ret void
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i32, i32* %P, i64 %indvar
+  store atomic i32 1, i32* %arrayidx unordered, align 4   ; unordered atomic store of constant 1 to P[indvar]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000             ; fixed trip count of 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/basic-address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/basic-address-space.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/basic-address-space.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/basic-address-space.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,91 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-p1:64:64:64-p2:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+; Two dimensional nested loop should be promoted to one big memset.
+define void @test10(i8 addrspace(2)* %X) nounwind ssp {
+; CHECK-LABEL: @test10(
+; CHECK: entry:
+; CHECK-NEXT: call void @llvm.memset.p2i8.i16(i8 addrspace(2)* align 1 %X, i8 0, i16 10000, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+
+entry:
+  br label %bb.nph
+
+bb.nph:                                           ; preds = %entry, %for.inc10
+  %i.04 = phi i16 [ 0, %entry ], [ %inc12, %for.inc10 ]   ; outer induction variable i
+  br label %for.body5
+
+for.body5:                                        ; preds = %for.body5, %bb.nph
+  %j.02 = phi i16 [ 0, %bb.nph ], [ %inc, %for.body5 ]    ; inner induction variable j
+  %mul = mul nsw i16 %i.04, 100
+  %add = add nsw i16 %j.02, %mul                          ; flat index i * 100 + j
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(2)* %X, i16 %add
+  store i8 0, i8 addrspace(2)* %arrayidx, align 1         ; X[i * 100 + j] = 0 in address space 2
+  %inc = add nsw i16 %j.02, 1
+  %cmp4 = icmp eq i16 %inc, 100                           ; inner loop runs 100 times
+  br i1 %cmp4, label %for.inc10, label %for.body5
+
+for.inc10:                                        ; preds = %for.body5
+  %inc12 = add nsw i16 %i.04, 1
+  %cmp = icmp eq i16 %inc12, 100                          ; outer loop runs 100 times
+  br i1 %cmp, label %for.end13, label %bb.nph
+
+for.end13:                                        ; preds = %for.inc10
+  ret void
+}
+
+define void @test11_pattern(i32 addrspace(2)* nocapture %P) nounwind ssp { ; constant-fill loop over an addrspace(2) pointer
+; CHECK-LABEL: @test11_pattern(
+; CHECK-NOT: memset_pattern
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i32, i32 addrspace(2)* %P, i64 %indvar
+  store i32 1, i32 addrspace(2)* %arrayidx, align 4       ; P[indvar] = 1 in address space 2
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000             ; fixed trip count of 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; PR9815 - This is a partial overlap case that cannot be safely transformed
+; into a memcpy.
+@g_50 = addrspace(2) global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
+
+
+define i32 @test14() nounwind {
+; CHECK-LABEL: @test14(
+; CHECK: for.body:
+; CHECK: load i32
+; CHECK: store i32
+; CHECK: br i1 %cmp
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %tmp5 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %add = add nsw i32 %tmp5, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [7 x i32], [7 x i32] addrspace(2)* @g_50, i32 0, i64 %idxprom
+  %tmp2 = load i32, i32 addrspace(2)* %arrayidx, align 4      ; read g_50[tmp5 + 4]
+  %add4 = add nsw i32 %tmp5, 5
+  %idxprom5 = sext i32 %add4 to i64
+  %arrayidx6 = getelementptr inbounds [7 x i32], [7 x i32] addrspace(2)* @g_50, i32 0, i64 %idxprom5
+  store i32 %tmp2, i32 addrspace(2)* %arrayidx6, align 4      ; write g_50[tmp5 + 5], the element the next iteration reads
+  %inc = add nsw i32 %tmp5, 1
+  %cmp = icmp slt i32 %inc, 2                                 ; loop runs for tmp5 = 0 and 1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp8 = load i32, i32 addrspace(2)* getelementptr inbounds ([7 x i32], [7 x i32] addrspace(2)* @g_50, i32 0, i64 6), align 4
+  ret i32 %tmp8                                               ; returns g_50[6]
+}
+

Added: llvm/trunk/test/Transforms/LoopIdiom/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/basic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/basic.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/basic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,712 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; For @test11_pattern
+; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 1, i32 1, i32 1, i32 1]
+
+; For @test13_pattern
+; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x i32*] [i32* @G, i32* @G]
+
+target triple = "x86_64-apple-darwin10.0.0"
+
+define void @test1(i8* %Base, i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  store i8 0, i8* %I.0.014, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test1(
+; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %Base, i8 0, i64 %Size, i1 false)
+; CHECK-NOT: store
+}
+
+; Make sure memset is formed for larger than 1 byte stores, and that the
+; alignment of the store is preserved
+define void @test1_i16(i16* align 2 %Base, i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i16, i16* %Base, i64 %indvar
+  store i16 0, i16* %I.0.014, align 2
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test1_i16(
+; CHECK: %[[BaseBC:.*]] = bitcast i16* %Base to i8*
+; CHECK: %[[Sz:[0-9]+]] = shl i64 %Size, 1
+; CHECK: call void @llvm.memset.p0i8.i64(i8* align 2 %[[BaseBC]], i8 0, i64 %[[Sz]], i1 false)
+; CHECK-NOT: store
+}
+
+; This is a loop that was rotated but where the blocks weren't merged.  This
+; shouldn't perturb us.
+define void @test1a(i8* %Base, i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body.cont
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body.cont ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  store i8 0, i8* %I.0.014, align 1
+  %indvar.next = add i64 %indvar, 1
+  br label %for.body.cont
+for.body.cont:                                    ; preds = %for.body
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body.cont
+  ret void
+; CHECK-LABEL: @test1a(
+; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %Base, i8 0, i64 %Size, i1 false)
+; CHECK-NOT: store
+}
+
+
+define void @test2(i32* %Base, i64 %Size) nounwind ssp {
+entry:
+  %cmp10 = icmp eq i64 %Size, 0
+  br i1 %cmp10, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.011 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %add.ptr.i = getelementptr i32, i32* %Base, i64 %i.011
+  store i32 16843009, i32* %add.ptr.i, align 4
+  %inc = add nsw i64 %i.011, 1
+  %exitcond = icmp eq i64 %inc, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @test2(
+; CHECK: br i1 %cmp10,
+; CHECK: %0 = shl i64 %Size, 2
+; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %Base1, i8 1, i64 %0, i1 false)
+; CHECK-NOT: store
+}
+
+; This is a case where there is an extra may-aliased store in the loop, so we
+; can't promote the memset.
+define void @test3(i32* %Base, i64 %Size, i8 *%MayAlias) nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.011 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %add.ptr.i = getelementptr i32, i32* %Base, i64 %i.011
+  store i32 16843009, i32* %add.ptr.i, align 4
+  ; %MayAlias is an unrelated pointer argument, so this store may alias the one above.
+  store i8 42, i8* %MayAlias
+  %inc = add nsw i64 %i.011, 1
+  %exitcond = icmp eq i64 %inc, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test3(
+; CHECK-NOT: memset
+; CHECK: ret void
+}
+
+; Make sure the first store in the loop is turned into a memset.
+define void @test4(i8* %Base) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  %Base100 = getelementptr i8, i8* %Base, i64 1000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  store i8 0, i8* %I.0.014, align 1
+  
+  ;; Store at %Base+1000, past the 100-byte memset range; safe to promote.
+  store i8 42, i8* %Base100
+  
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test4(
+; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %Base, i8 0, i64 100, i1 false)
+}
+
+; This can't be promoted: the memset is a store of a loop variant value.
+define void @test5(i8* %Base, i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  ; The stored value is derived from the induction variable, so it changes every iteration.
+  %V = trunc i64 %indvar to i8
+  store i8 %V, i8* %I.0.014, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test5(
+; CHECK-NOT: memset
+; CHECK: ret void
+}
+
+
+;; memcpy formation
+define void @test6(i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load i8, i8* %I.0.014, align 1
+  store i8 %V, i8* %DestI, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test6(
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %Dest, i8* align 1 %Base, i64 %Size, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+;; memcpy formation, check alignment
+define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load i32, i32* %I.0.014, align 1
+  store i32 %V, i32* %DestI, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test6_dest_align(
+; CHECK: %[[Dst:.*]] = bitcast i32* %Dest to i8*
+; CHECK: %[[Src:.*]] = bitcast i32* %Base to i8*
+; CHECK: %[[Sz:[0-9]+]] = shl i64 %Size, 2
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[Dst]], i8* align 1 %[[Src]], i64 %[[Sz]], i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+;; memcpy formation: source align 4 and dest align 1 must carry over to the call
+define void @test6_src_align(i32* noalias align 4 %Base, i32* noalias align 1 %Dest, i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load i32, i32* %I.0.014, align 4
+  store i32 %V, i32* %DestI, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test6_src_align(
+; CHECK: %[[Dst:.*]] = bitcast i32* %Dest to i8*
+; CHECK: %[[Src:.*]] = bitcast i32* %Base to i8*
+; CHECK: %[[Sz:[0-9]+]] = shl i64 %Size, 2
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %[[Dst]], i8* align 4 %[[Src]], i64 %[[Sz]], i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+
+; This is a loop that was rotated but where the blocks weren't merged.  This
+; shouldn't perturb us.
+define void @test7(i8* %Base, i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body.cont
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body.cont ]
+  br label %for.body.cont
+for.body.cont:                                    ; preds = %for.body
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  store i8 0, i8* %I.0.014, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body.cont
+  ret void
+; CHECK-LABEL: @test7(
+; CHECK: call void @llvm.memset.p0i8.i64(i8* align 1 %Base, i8 0, i64 %Size, i1 false)
+; CHECK-NOT: store
+}
+
+; This loop should not be transformed: it only executes one iteration.
+define void @test8(i64* %Ptr, i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %PI = getelementptr i64, i64* %Ptr, i64 %indvar
+  store i64 0, i64 *%PI
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test8(
+; CHECK: store i64 0, i64* %PI
+}
+
+declare i8* @external(i8*)
+
+;; This cannot be transformed into a memcpy, because the read-from location is
+;; mutated by the loop.
+define void @test9(i64 %Size) nounwind ssp {
+bb.nph:                                           ; entry block, no predecessors
+  %Base = alloca i8, i32 10000
+  %Dest = alloca i8, i32 10000
+  ; %Base is passed to an external function, so the returned pointer may alias it.
+  %BaseAlias = call i8* @external(i8* %Base)
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  %DestI = getelementptr i8, i8* %Dest, i64 %indvar
+  %V = load i8, i8* %I.0.014, align 1
+  store i8 %V, i8* %DestI, align 1
+
+  ;; This store can clobber the input.
+  store i8 4, i8* %BaseAlias
+ 
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test9(
+; CHECK-NOT: llvm.memcpy
+; CHECK: ret void
+}
+
+; Two dimensional nested loop should be promoted to one big memset.
+define void @test10(i8* %X) nounwind ssp {
+entry:
+  br label %bb.nph
+
+bb.nph:                                           ; preds = %entry, %for.inc10
+  %i.04 = phi i32 [ 0, %entry ], [ %inc12, %for.inc10 ]
+  br label %for.body5
+
+for.body5:                                        ; preds = %for.body5, %bb.nph
+  %j.02 = phi i32 [ 0, %bb.nph ], [ %inc, %for.body5 ]
+  %mul = mul nsw i32 %i.04, 100
+  %add = add nsw i32 %j.02, %mul
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i8, i8* %X, i64 %idxprom
+  store i8 0, i8* %arrayidx, align 1
+  %inc = add nsw i32 %j.02, 1
+  %cmp4 = icmp eq i32 %inc, 100
+  br i1 %cmp4, label %for.inc10, label %for.body5
+
+for.inc10:                                        ; preds = %for.body5
+  %inc12 = add nsw i32 %i.04, 1
+  %cmp = icmp eq i32 %inc12, 100
+  br i1 %cmp, label %for.end13, label %bb.nph
+
+for.end13:                                        ; preds = %for.inc10
+  ret void
+; CHECK-LABEL: @test10(
+; CHECK: entry:
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 %X, i8 0, i64 10000, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+; On darwin10 (which is the triple in this .ll file) this loop can be turned
+; into a memset_pattern call.
+; rdar://9009151
+define void @test11_pattern(i32* nocapture %P) nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i32, i32* %P, i64 %indvar
+  store i32 1, i32* %arrayidx, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test11_pattern(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: memset_pattern
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+; Store of null should turn into memset of zero.
+define void @test12(i32** nocapture %P) nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i32*, i32** %P, i64 %indvar
+  store i32* null, i32** %arrayidx, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test12(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 %P1, i8 0, i64 80000, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+@G = global i32 5
+
+; This store-of-address loop can be turned into a memset_pattern call.
+; rdar://9009151
+define void @test13_pattern(i32** nocapture %P) nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i32*, i32** %P, i64 %indvar
+  store i32* @G, i32** %arrayidx, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test13_pattern(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: memset_pattern
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+
+
+; PR9815 - This is a partial overlap case that cannot be safely transformed
+; into a memcpy.
+@g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
+
+define i32 @test14() nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %tmp5 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %add = add nsw i32 %tmp5, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [7 x i32], [7 x i32]* @g_50, i32 0, i64 %idxprom
+  %tmp2 = load i32, i32* %arrayidx, align 4 ; reads g_50[i + 4]
+  %add4 = add nsw i32 %tmp5, 5
+  %idxprom5 = sext i32 %add4 to i64
+  %arrayidx6 = getelementptr inbounds [7 x i32], [7 x i32]* @g_50, i32 0, i64 %idxprom5
+  store i32 %tmp2, i32* %arrayidx6, align 4 ; writes g_50[i + 5], the element the next iteration loads
+  %inc = add nsw i32 %tmp5, 1
+  %cmp = icmp slt i32 %inc, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %tmp8 = load i32, i32* getelementptr inbounds ([7 x i32], [7 x i32]* @g_50, i32 0, i64 6), align 4
+  ret i32 %tmp8
+; CHECK-LABEL: @test14(
+; CHECK: for.body:
+; CHECK: load i32
+; CHECK: store i32
+; CHECK: br i1 %cmp
+
+}
+
+define void @PR14241(i32* %s, i64 %size) {
+; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught
+; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy
+; instead of a memmove. If we get the memmove transform back, this will catch
+; regressions.
+;
+; CHECK-LABEL: @PR14241(
+
+entry:
+  %end.idx = add i64 %size, -1
+  %end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx
+  br label %while.body
+; CHECK-NOT: memcpy
+;
+; FIXME: When we regain the ability to form a memmove here, this test should be
+; reversed and turned into a positive assertion.
+; CHECK-NOT: memmove
+
+while.body:
+  %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
+  %src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
+  %val = load i32, i32* %src.ptr, align 4
+; CHECK: load
+  %dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0
+  store i32 %val, i32* %dst.ptr, align 4
+; CHECK: store
+  %next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
+  %cmp = icmp eq i32* %next.ptr, %end.ptr
+  br i1 %cmp, label %exit, label %while.body
+
+exit:
+  ret void
+; CHECK: ret void
+}
+
+; Recognize loops with a negative stride.
+define void @test15(i32* nocapture %f) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 65536, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
+  store i32 0, i32* %arrayidx, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp = icmp sgt i64 %indvars.iv, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+; CHECK-LABEL: @test15(
+; CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %f1, i8 0, i64 262148, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+; Loop with a negative stride.  Verify an aliasing write to f[65536] prevents
+; the creation of a memset.
+define void @test16(i32* nocapture %f) {
+entry:
+  %arrayidx1 = getelementptr inbounds i32, i32* %f, i64 65536
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 65536, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
+  store i32 0, i32* %arrayidx, align 4
+  store i32 1, i32* %arrayidx1, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp = icmp sgt i64 %indvars.iv, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test16(
+; CHECK-NOT: call void @llvm.memset.p0i8.i64
+; CHECK: ret void
+}
+
+; Handle memcpy-able loops with negative stride.
+define noalias i32* @test17(i32* nocapture readonly %a, i32 %c) {
+entry:
+  %conv = sext i32 %c to i64
+  %mul = shl nsw i64 %conv, 2
+  %call = tail call noalias i8* @malloc(i64 %mul)
+  %0 = bitcast i8* %call to i32*
+  %tobool.9 = icmp eq i32 %c, 0
+  br i1 %tobool.9, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %dec10.in = phi i32 [ %dec10, %while.body ], [ %c, %while.body.preheader ]
+  %dec10 = add nsw i32 %dec10.in, -1
+  %idxprom = sext i32 %dec10 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %0, i64 %idxprom
+  store i32 %1, i32* %arrayidx2, align 4
+  %tobool = icmp eq i32 %dec10, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  ret i32* %0
+; CHECK-LABEL: @test17(
+; CHECK: call void @llvm.memcpy
+; CHECK: ret i32*
+}
+
+declare noalias i8* @malloc(i64)
+
+; Handle memcpy-able loops with negative stride.
+; void test18(unsigned *__restrict__ a, unsigned *__restrict__ b) {
+;   for (int i = 2047; i >= 0; --i) {
+;     a[i] = b[i];
+;   }
+; }
+define void @test18(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 2047, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp = icmp sgt i64 %indvars.iv, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+; CHECK-LABEL: @test18(
+; CHECK: call void @llvm.memcpy
+; CHECK: ret
+}
+
+; Two dimensional nested loop with negative stride should be promoted to one big memset.
+define void @test19(i8* nocapture %X) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc4
+  %i.06 = phi i32 [ 99, %entry ], [ %dec5, %for.inc4 ]
+  %mul = mul nsw i32 %i.06, 100
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.body3
+  %j.05 = phi i32 [ 99, %for.cond1.preheader ], [ %dec, %for.body3 ]
+  %add = add nsw i32 %j.05, %mul
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i8, i8* %X, i64 %idxprom
+  store i8 0, i8* %arrayidx, align 1
+  %dec = add nsw i32 %j.05, -1
+  %cmp2 = icmp sgt i32 %j.05, 0
+  br i1 %cmp2, label %for.body3, label %for.inc4
+
+for.inc4:                                         ; preds = %for.body3
+  %dec5 = add nsw i32 %i.06, -1
+  %cmp = icmp sgt i32 %i.06, 0
+  br i1 %cmp, label %for.cond1.preheader, label %for.end6
+
+for.end6:                                         ; preds = %for.inc4
+  ret void
+; CHECK-LABEL: @test19(
+; CHECK: entry:
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 %X, i8 0, i64 10000, i1 false)
+; CHECK: ret void
+}
+
+; Handle loops where the trip count is a narrow integer that needs to be
+; extended.
+define void @form_memset_narrow_size(i64* %ptr, i32 %size) {
+; CHECK-LABEL: @form_memset_narrow_size(
+entry:
+  %cmp1 = icmp sgt i32 %size, 0
+  br i1 %cmp1, label %loop.ph, label %exit
+; CHECK:       entry:
+; CHECK:         %[[C1:.*]] = icmp sgt i32 %size, 0
+; CHECK-NEXT:    br i1 %[[C1]], label %loop.ph, label %exit
+
+loop.ph:
+  br label %loop.body
+; CHECK:       loop.ph:
+; CHECK-NEXT:    %[[ZEXT_SIZE:.*]] = zext i32 %size to i64
+; CHECK-NEXT:    %[[SCALED_SIZE:.*]] = shl i64 %[[ZEXT_SIZE]], 3
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 %{{.*}}, i8 0, i64 %[[SCALED_SIZE]], i1 false)
+
+loop.body:
+  %storemerge4 = phi i32 [ 0, %loop.ph ], [ %inc, %loop.body ]
+  %idxprom = sext i32 %storemerge4 to i64
+  %arrayidx = getelementptr inbounds i64, i64* %ptr, i64 %idxprom
+  store i64 0, i64* %arrayidx, align 8
+  %inc = add nsw i32 %storemerge4, 1
+  %cmp2 = icmp slt i32 %inc, %size
+  br i1 %cmp2, label %loop.body, label %loop.exit
+
+loop.exit:
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @form_memcpy_narrow_size(i64* noalias %dst, i64* noalias %src, i32 %size) {
+; CHECK-LABEL: @form_memcpy_narrow_size(
+entry:
+  %cmp1 = icmp sgt i32 %size, 0
+  br i1 %cmp1, label %loop.ph, label %exit
+; CHECK:       entry:
+; CHECK:         %[[C1:.*]] = icmp sgt i32 %size, 0
+; CHECK-NEXT:    br i1 %[[C1]], label %loop.ph, label %exit
+
+loop.ph:
+  br label %loop.body
+; CHECK:       loop.ph:
+; CHECK-NEXT:    %[[ZEXT_SIZE:.*]] = zext i32 %size to i64
+; CHECK-NEXT:    %[[SCALED_SIZE:.*]] = shl i64 %[[ZEXT_SIZE]], 3
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %{{.*}}, i8* align 8 %{{.*}}, i64 %[[SCALED_SIZE]], i1 false)
+
+loop.body:
+  %storemerge4 = phi i32 [ 0, %loop.ph ], [ %inc, %loop.body ]
+  %idxprom1 = sext i32 %storemerge4 to i64
+  %arrayidx1 = getelementptr inbounds i64, i64* %src, i64 %idxprom1
+  %v = load i64, i64* %arrayidx1, align 8
+  %idxprom2 = sext i32 %storemerge4 to i64
+  %arrayidx2 = getelementptr inbounds i64, i64* %dst, i64 %idxprom2
+  store i64 %v, i64* %arrayidx2, align 8
+  %inc = add nsw i32 %storemerge4, 1
+  %cmp2 = icmp slt i32 %inc, %size
+  br i1 %cmp2, label %loop.body, label %loop.exit
+
+loop.exit:
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Validate that "memset_pattern" has the proper attributes.
+; CHECK: declare void @memset_pattern16(i8* nocapture, i8* nocapture readonly, i64) [[ATTRS:#[0-9]+]]
+; CHECK: [[ATTRS]] = { argmemonly }

Added: llvm/trunk/test/Transforms/LoopIdiom/crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt -basicaa -loop-idiom -S < %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Don't crash inside DependenceAnalysis
+; PR14219
+define void @test1(i64* %iwork, i64 %x)  {
+bb0:
+  %mul116 = mul nsw i64 %x, %x
+  %incdec.ptr6.sum175 = add i64 42, %x
+  %arrayidx135 = getelementptr inbounds i64, i64* %iwork, i64 %incdec.ptr6.sum175
+  br label %bb1
+bb1:
+  %storemerge4226 = phi i64 [ 0, %bb0 ], [ %inc139, %bb1 ]
+  store i64 1, i64* %arrayidx135, align 8
+  %incdec.ptr6.sum176 = add i64 %mul116, %storemerge4226
+  %arrayidx137 = getelementptr inbounds i64, i64* %iwork, i64 %incdec.ptr6.sum176
+  store i64 1, i64* %arrayidx137, align 8
+  %inc139 = add nsw i64 %storemerge4226, 1
+  %cmp131 = icmp sgt i64 %storemerge4226, 42
+  br i1 %cmp131, label %bb2, label %bb1
+bb2:
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/LoopIdiom/ctpop-multiple-users-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/ctpop-multiple-users-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/ctpop-multiple-users-crash.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/ctpop-multiple-users-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt -loop-idiom -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios8.0.0"
+
+; When we replace the precondition with a ctpop, we need to ensure
+; that only the first branch reads the ctpop.  The store prior
+; to that should continue to read from the original compare.
+
+; CHECK: %tobool.5 = icmp ne i32 %num, 0
+; CHECK: store i1 %tobool.5, i1* %ptr
+
+define internal fastcc i32 @num_bits_set(i32 %num, i1* %ptr) #1 {
+entry:
+  %tobool.5 = icmp ne i32 %num, 0
+  store i1 %tobool.5, i1* %ptr
+  br i1 %tobool.5, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %count.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %num.addr.06 = phi i32 [ %num, %for.body.lr.ph ], [ %and, %for.body ]
+  %sub = add i32 %num.addr.06, -1
+  %and = and i32 %sub, %num.addr.06 ; x & (x - 1): clears the lowest set bit
+  %inc = add nsw i32 %count.07, 1
+  %tobool = icmp ne i32 %and, 0
+  br i1 %tobool, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  %count.0.lcssa = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  ret i32 %count.0.lcssa
+}
\ No newline at end of file

Added: llvm/trunk/test/Transforms/LoopIdiom/dbginfo-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/dbginfo-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/dbginfo-cost.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/dbginfo-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,68 @@
+; RUN: opt -S -loop-idiom -mtriple=systemz-unknown -mcpu=z13 %s | FileCheck %s
+
+; CHECK: @llvm.ctlz.i32
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+define dso_local i32 @CeilLog2(i32 %arg) local_unnamed_addr #1 !dbg !38 {
+bb:
+  %tmp4 = add i32 %arg, -1, !dbg !45
+  call void @llvm.dbg.value(metadata i32 0, metadata !44, metadata !DIExpression()), !dbg !45
+  %tmp71 = icmp eq i32 %tmp4, 0, !dbg !45
+  br i1 %tmp71, label %bb13, label %bb8.preheader, !dbg !48
+
+bb8.preheader:                                    ; preds = %bb
+  br label %bb8, !dbg !49
+
+bb8:                                              ; preds = %bb8.preheader, %bb8
+  %tmp2.03 = phi i32 [ %tmp12, %bb8 ], [ 0, %bb8.preheader ]
+  %tmp1.02 = phi i32 [ %tmp10, %bb8 ], [ %tmp4, %bb8.preheader ]
+  call void @llvm.dbg.value(metadata i32 %tmp2.03, metadata !44, metadata !DIExpression()), !dbg !45
+  %tmp10 = lshr i32 %tmp1.02, 1, !dbg !49
+  %tmp12 = add nuw nsw i32 %tmp2.03, 1, !dbg !51
+  call void @llvm.dbg.value(metadata i32 %tmp12, metadata !44, metadata !DIExpression()), !dbg !45
+  %tmp7 = icmp eq i32 %tmp10, 0, !dbg !45
+  br i1 %tmp7, label %bb13.loopexit, label %bb8, !dbg !48, !llvm.loop !52
+
+bb13.loopexit:                                    ; preds = %bb8
+  %tmp12.lcssa = phi i32 [ %tmp12, %bb8 ], !dbg !51
+  br label %bb13, !dbg !54
+
+bb13:                                             ; preds = %bb13.loopexit, %bb
+  %tmp2.0.lcssa = phi i32 [ 0, %bb ], [ %tmp12.lcssa, %bb13.loopexit ], !dbg !55
+  call void @llvm.dbg.value(metadata i32 %tmp2.0.lcssa, metadata !44, metadata !DIExpression()), !dbg !45
+  ret i32 %tmp2.0.lcssa, !dbg !54
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind readnone speculatable "target-cpu"="z13" }
+attributes #1 = { norecurse nounwind readnone "target-cpu"="z13" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!36, !37}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 9.0.0 (ijonpan@m35lp38.lnxne.boe:llvm/llvm-dev-2/tools/clang a87ff88c6466fbedd6281513b9480a2cad6c08c8) (llvm/llvm-dev-2 922a3b1b3254bf3310c467e880a5419c1e13c87f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, nameTableKind: None)
+!1 = !DIFile(filename: "configfile.c", directory: "/home/ijonpan/minispec-2006/spec-llvm/464.h264ref/build")
+!2 = !{}
+!4 = !DIFile(filename: "./global.h", directory: "/home/ijonpan/minispec-2006/spec-llvm/464.h264ref/build")
+!5 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!36 = !{i32 2, !"Debug Info Version", i32 3}
+!37 = !{i32 1, !"wchar_size", i32 4}
+!38 = distinct !DISubprogram(name: "CeilLog2", scope: !1, file: !1, line: 599, type: !39, scopeLine: 600, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!39 = !DISubroutineType(types: !40)
+!40 = !{!5, !5}
+!41 = !{!42, !43, !44}
+!42 = !DILocalVariable(name: "uiVal", arg: 1, scope: !38, file: !1, line: 599, type: !5)
+!43 = !DILocalVariable(name: "uiTmp", scope: !38, file: !1, line: 601, type: !5)
+!44 = !DILocalVariable(name: "uiRet", scope: !38, file: !1, line: 602, type: !5)
+!45 = !DILocation(line: 601, column: 25, scope: !38)
+!48 = !DILocation(line: 604, column: 3, scope: !38)
+!49 = !DILocation(line: 606, column: 11, scope: !50)
+!50 = distinct !DILexicalBlock(scope: !38, file: !1, line: 605, column: 3)
+!51 = !DILocation(line: 607, column: 10, scope: !50)
+!52 = distinct !{!52, !48, !53}
+!53 = !DILocation(line: 608, column: 3, scope: !38)
+!54 = !DILocation(line: 609, column: 3, scope: !38)
+!55 = !DILocation(line: 0, scope: !38)

Added: llvm/trunk/test/Transforms/LoopIdiom/debug-line.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/debug-line.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/debug-line.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/debug-line.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,52 @@
+; RUN: opt -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+
+define void @foo(double* nocapture %a) nounwind ssp !dbg !0 { ; stores 0.0 to %a[0] through %a[999], one double per iteration
+entry:
+  tail call void @llvm.dbg.value(metadata double* %a, metadata !5, metadata !DIExpression()), !dbg !8
+  tail call void @llvm.dbg.value(metadata i32 0, metadata !10, metadata !DIExpression()), !dbg !14
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr double, double* %a, i64 %indvar
+; CHECK: call void @llvm.memset{{.+}} !dbg 
+  store double 0.000000e+00, double* %arrayidx, align 8, !dbg !15 ; only store in the loop; it carries location !15
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp ne i64 %indvar.next, 1000 ; keep looping while the incremented index differs from 1000
+  br i1 %exitcond, label %for.body, label %for.end, !dbg !14
+
+for.end:                                          ; preds = %for.body
+  tail call void @llvm.dbg.value(metadata !{null}, metadata !10, metadata !DIExpression()), !dbg !16
+  ret void, !dbg !17
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone ; declared only; @foo calls just @llvm.dbg.value
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
+
+!llvm.module.flags = !{!19}
+!llvm.dbg.cu = !{!2}
+
+!0 = distinct !DISubprogram(name: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !2, file: !18, scope: !1, type: !3)
+!1 = !DIFile(filename: "li.c", directory: "/private/tmp") ; same filename and directory as !18
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 127165:127174)", isOptimized: true, emissionKind: FullDebug, file: !18, enums: !9, retainedTypes: !9)
+!3 = !DISubroutineType(types: !4)
+!4 = !{null}
+!5 = !DILocalVariable(name: "a", line: 2, arg: 1, scope: !0, file: !1, type: !6) ; bound to %a by the first dbg.value in @foo
+!6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, scope: !2, baseType: !7)
+!7 = !DIBasicType(tag: DW_TAG_base_type, name: "double", size: 64, align: 64, encoding: DW_ATE_float)
+!8 = !DILocation(line: 2, column: 18, scope: !0)
+!9 = !{}
+!10 = !DILocalVariable(name: "i", line: 3, scope: !11, file: !1, type: !13) ; referenced by the second and third dbg.value calls
+!11 = distinct !DILexicalBlock(line: 3, column: 3, file: !18, scope: !12)
+!12 = distinct !DILexicalBlock(line: 2, column: 21, file: !18, scope: !0)
+!13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!14 = !DILocation(line: 3, column: 3, scope: !12) ; attached to the loop's conditional branch
+!15 = !DILocation(line: 4, column: 5, scope: !11) ; attached to the store in for.body
+!16 = !DILocation(line: 3, column: 29, scope: !11)
+!17 = !DILocation(line: 5, column: 1, scope: !12) ; attached to the ret
+!18 = !DIFile(filename: "li.c", directory: "/private/tmp")
+!19 = !{i32 1, !"Debug Info Version", i32 3}

Added: llvm/trunk/test/Transforms/LoopIdiom/int_sideeffect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/int_sideeffect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/int_sideeffect.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/int_sideeffect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt -S < %s -loop-idiom | FileCheck %s
+
+declare void @llvm.sideeffect()
+
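+; Loop idiom recognition across a @llvm.sideeffect. NOTE(review): the loop in @zero below contains no call to @llvm.sideeffect; confirm whether one was meant to be in %bb7.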
+
+; CHECK-LABEL: zero
+; CHECK: llvm.memset
+define void @zero(float* %p, i64 %n) nounwind { ; stores 0.0 to %p[0], %p[1], ... and stops once the next index is no longer below %n
+bb7.lr.ph:
+  br label %bb7
+
+bb7:
+  %i.02 = phi i64 [ 0, %bb7.lr.ph ], [ %tmp13, %bb7 ]
+  %tmp10 = getelementptr inbounds float, float* %p, i64 %i.02
+  store float 0.000000e+00, float* %tmp10, align 4
+  %tmp13 = add i64 %i.02, 1
+  %tmp6 = icmp ult i64 %tmp13, %n ; unsigned compare of the incremented index against %n
+  br i1 %tmp6, label %bb7, label %bb14 ; the first store happens before any comparison with %n
+
+bb14:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/lir-heurs-multi-block-loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/lir-heurs-multi-block-loop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/lir-heurs-multi-block-loop.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/lir-heurs-multi-block-loop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,182 @@
+;  RUN: opt -basicaa -loop-idiom -use-lir-code-size-heurs=true < %s -S | FileCheck %s
+
+; When compiling for codesize we avoid idiom recognition for a
+; multi-block loop unless it is one of
+; - a loop_memset idiom, or
+; - a memset/memcpy idiom in a nested loop.
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+@APPLES = common global i32 0, align 4 ; loaded and conditionally decremented inside the loops below
+@ORANGES = common global i32 0, align 4 ; loaded and conditionally decremented inside the loops below
+
+; LIR allowed: loop_memset idiom in multi-block loop.
+; ===================================================
+; CHECK-LABEL: @LoopMemset
+; CHECK: for.body.preheader:
+; CHECK: call void @llvm.memset
+; CHECK: for.body:
+;
+define i32 @LoopMemset([2048 x i8]* noalias nocapture %DST, i32 %SIZE) local_unnamed_addr optsize { ; each iteration memsets one 2048-byte row of %DST to -1, then updates one of the two globals
+entry:
+  %cmp12 = icmp sgt i32 %SIZE, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.end ; skip the loop entirely when %SIZE <= 0
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %BASKET.013 = phi i32 [ %BASKET.1, %for.inc ], [ 0, %for.body.preheader ] ; running sum returned at the end
+  %arraydecay = getelementptr inbounds [2048 x i8], [2048 x i8]* %DST, i64 %indvars.iv, i64 0
+  tail call void @llvm.memset.p0i8.i64(i8* %arraydecay, i8 -1, i64 2048, i1 false)
+  %0 = trunc i64 %indvars.iv to i32
+  %rem11 = and i32 %0, 1
+  %cmp1 = icmp eq i32 %rem11, 0 ; true when the low bit of the index is clear
+  %1 = load i32, i32* @ORANGES, align 4
+  %2 = load i32, i32* @APPLES, align 4
+  br i1 %cmp1, label %if.then, label %if.else ; this branch makes the loop body span several blocks
+
+if.else:                                          ; preds = %for.body
+  %dec3 = add nsw i32 %2, -1
+  store i32 %dec3, i32* @APPLES, align 4
+  br label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %dec = add nsw i32 %1, -1
+  store i32 %dec, i32* @ORANGES, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.else
+  %.pn = phi i32 [ %2, %if.then ], [ %1, %if.else ] ; the loaded value of the global that was not decremented on this path
+  %BASKET.1 = add nsw i32 %.pn, %BASKET.013
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %SIZE
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.inc
+  %BASKET.1.lcssa = phi i32 [ %BASKET.1, %for.inc ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %BASKET.0.lcssa = phi i32 [ 0, %entry ], [ %BASKET.1.lcssa, %for.end.loopexit ] ; 0 when the loop was skipped
+  ret i32 %BASKET.0.lcssa
+}
+
+; LIR allowed: memset idiom in multi-block nested loop,
+; which is recognized as a loop_memset in its turn.
+; =====================================================
+; CHECK-LABEL: @NestedMemset_LoopMemset
+; CHECK: for.cond1.preheader.preheader:
+; CHECK: call void @llvm.memset
+; CHECK: for.cond1.preheader:
+;
+define i32 @NestedMemset_LoopMemset([2046 x i8]* noalias nocapture %DST, i32 %SIZE) local_unnamed_addr optsize { ; outer loop over rows of %DST, inner loop stores -1 to each of the 2046 bytes of a row
+entry:
+  %cmp25 = icmp sgt i32 %SIZE, 0
+  br i1 %cmp25, label %for.cond1.preheader.preheader, label %for.end11 ; skip both loops when %SIZE <= 0
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc9
+  %i.027 = phi i32 [ %inc10, %for.inc9 ], [ 0, %for.cond1.preheader.preheader ] ; outer (row) index
+  %BASKET.026 = phi i32 [ %BASKET.2.lcssa, %for.inc9 ], [ 0, %for.cond1.preheader.preheader ]
+  %idxprom4 = sext i32 %i.027 to i64
+  %rem22 = and i32 %i.027, 1
+  %cmp6 = icmp eq i32 %rem22, 0 ; computed once per outer iteration, tested in every inner iteration
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] ; inner (byte) index
+  %BASKET.123 = phi i32 [ %BASKET.026, %for.cond1.preheader ], [ %BASKET.2, %for.inc ]
+  %arrayidx5 = getelementptr inbounds [2046 x i8], [2046 x i8]* %DST, i64 %idxprom4, i64 %indvars.iv
+  store i8 -1, i8* %arrayidx5, align 1 ; the byte store that forms the memset pattern
+  %0 = load i32, i32* @APPLES, align 4
+  %1 = load i32, i32* @ORANGES, align 4
+  br i1 %cmp6, label %if.then, label %if.else
+
+if.else:                                          ; preds = %for.body3
+  %dec8 = add nsw i32 %0, -1
+  store i32 %dec8, i32* @APPLES, align 4
+  br label %for.inc
+
+if.then:                                          ; preds = %for.body3
+  %dec = add nsw i32 %1, -1
+  store i32 %dec, i32* @ORANGES, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.else
+  %.pn = phi i32 [ %0, %if.then ], [ %1, %if.else ] ; the loaded value of the global that was not decremented on this path
+  %BASKET.2 = add nsw i32 %.pn, %BASKET.123
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 2046 ; inner trip count equals the row length
+  br i1 %exitcond, label %for.body3, label %for.inc9
+
+for.inc9:                                         ; preds = %for.inc
+  %BASKET.2.lcssa = phi i32 [ %BASKET.2, %for.inc ]
+  %inc10 = add nsw i32 %i.027, 1
+  %cmp = icmp slt i32 %inc10, %SIZE
+  br i1 %cmp, label %for.cond1.preheader, label %for.end11.loopexit
+
+for.end11.loopexit:                               ; preds = %for.inc9
+  %BASKET.2.lcssa.lcssa = phi i32 [ %BASKET.2.lcssa, %for.inc9 ]
+  br label %for.end11
+
+for.end11:                                        ; preds = %for.end11.loopexit, %entry
+  %BASKET.0.lcssa = phi i32 [ 0, %entry ], [ %BASKET.2.lcssa.lcssa, %for.end11.loopexit ] ; 0 when the loops were skipped
+  ret i32 %BASKET.0.lcssa
+}
+
+; LIR avoided: memset idiom in multi-block top-level loop.
+; ========================================================
+; CHECK-LABEL: @Non_NestedMemset 
+; CHECK-NOT: call void @llvm.memset
+;
+define i32 @Non_NestedMemset(i8* noalias nocapture %DST, i32 %SIZE) local_unnamed_addr optsize { ; single top-level loop storing -1 to %DST[i] with a branch in the body
+entry:
+  %cmp12 = icmp sgt i32 %SIZE, 0
+  br i1 %cmp12, label %for.body.preheader, label %for.end ; skip the loop entirely when %SIZE <= 0
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %BASKET.013 = phi i32 [ %BASKET.1, %for.inc ], [ 0, %for.body.preheader ] ; running sum returned at the end
+  %arrayidx = getelementptr inbounds i8, i8* %DST, i64 %indvars.iv
+  store i8 -1, i8* %arrayidx, align 1 ; byte store that the directives above expect to remain a store
+  %0 = trunc i64 %indvars.iv to i32
+  %rem11 = and i32 %0, 1
+  %cmp1 = icmp eq i32 %rem11, 0 ; true when the low bit of the index is clear
+  %1 = load i32, i32* @ORANGES, align 4
+  %2 = load i32, i32* @APPLES, align 4
+  br i1 %cmp1, label %if.then, label %if.else ; this branch makes the loop body span several blocks
+
+if.else:                                          ; preds = %for.body
+  %dec3 = add nsw i32 %2, -1
+  store i32 %dec3, i32* @APPLES, align 4
+  br label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %dec = add nsw i32 %1, -1
+  store i32 %dec, i32* @ORANGES, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.else
+  %.pn = phi i32 [ %2, %if.then ], [ %1, %if.else ] ; the loaded value of the global that was not decremented on this path
+  %BASKET.1 = add nsw i32 %.pn, %BASKET.013
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %SIZE
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.inc
+  %BASKET.1.lcssa = phi i32 [ %BASKET.1, %for.inc ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %BASKET.0.lcssa = phi i32 [ 0, %entry ], [ %BASKET.1.lcssa, %for.end.loopexit ] ; 0 when the loop was skipped
+  ret i32 %BASKET.0.lcssa
+}
+

Added: llvm/trunk/test/Transforms/LoopIdiom/memset_noidiom.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/memset_noidiom.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/memset_noidiom.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/memset_noidiom.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+; CHECK-LABEL: @memset(
+; CHECK-NOT: llvm.memset
+define i8* @memset(i8* %b, i32 %c, i64 %len) nounwind uwtable ssp { ; byte-fill loop; NOTE(review): presumably left alone because the function itself is named memset - confirm against the pass
+entry:
+  %cmp1 = icmp ult i64 0, %len
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end ; nothing to do when %len is 0
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv6 = trunc i32 %c to i8 ; fill byte is the low 8 bits of %c
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvar = phi i64 [ 0, %for.body.lr.ph ], [ %indvar.next, %for.body ]
+  %p.02 = getelementptr i8, i8* %b, i64 %indvar
+  store i8 %conv6, i8* %p.02, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp ne i64 %indvar.next, %len ; loop writes %b[0] through %b[%len - 1]
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  ret i8* %b ; returns the destination pointer unchanged
+}
+

Added: llvm/trunk/test/Transforms/LoopIdiom/non-canonical-loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/non-canonical-loop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/non-canonical-loop.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/non-canonical-loop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt -S -loop-idiom < %s
+; Don't crash
+; PR13892
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test(i32* %currMB) nounwind uwtable { ; crash reproducer: a zeroing loop whose only entry edge comes from an indirectbr
+entry:
+  br i1 undef, label %start.exit, label %if.then.i
+
+if.then.i:                                        ; preds = %entry
+  unreachable
+
+start.exit:                       ; preds = %entry
+  indirectbr i8* undef, [label %0, label %for.bodyprime] ; the loop below is entered through this indirect branch
+
+; <label>:0                                       ; preds = %start.exit
+  unreachable
+
+for.bodyprime:                                    ; preds = %for.bodyprime, %start.exit
+  %i.057375 = phi i32 [ 0, %start.exit ], [ %1, %for.bodyprime ]
+  %arrayidx8prime = getelementptr inbounds i32, i32* %currMB, i32 %i.057375
+  store i32 0, i32* %arrayidx8prime, align 4 ; writes 0 to %currMB[0] through %currMB[3]
+  %1 = add i32 %i.057375, 1
+  %cmp5prime = icmp slt i32 %1, 4
+  br i1 %cmp5prime, label %for.bodyprime, label %for.endprime
+
+for.endprime:                                     ; preds = %for.bodyprime
+  br label %for.body23prime
+
+for.body23prime:                                  ; preds = %for.body23prime, %for.endprime
+  br label %for.body23prime ; unconditional self-loop; the function has no ret
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/non-integral-pointers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/non-integral-pointers.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/non-integral-pointers.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/non-integral-pointers.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,48 @@
+; RUN: opt -S -basicaa -loop-idiom < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:4"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f_0(i8 addrspace(3)** %ptr) { ; stores a null addrspace(3) pointer to %ptr[0] through %ptr[9999]
+; CHECK-LABEL: @f_0(
+; CHECK: call{{.*}}memset
+
+; Turning stores of address space 3 pointers into a memset is fine, since
+; they're integral pointers (the datalayout's ni: entry lists only space 4).
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i8 addrspace(3)*, i8 addrspace(3)** %ptr, i64 %indvar
+  store i8 addrspace(3)* null, i8 addrspace(3)** %arrayidx, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000 ; exit once 10000 elements have been written
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @f_1(i8 addrspace(4)** %ptr) { ; same loop as @f_0 but the stored pointers are in address space 4
+; CHECK-LABEL: @f_1(
+; CHECK-NOT: call{{.*}}memset
+
+; Turning stores of address space 4 pointers into a memset is not ok, since
+; they're non-integral pointers (address space 4 is listed under ni: above).
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i8 addrspace(4)*, i8 addrspace(4)** %ptr, i64 %indvar
+  store i8 addrspace(4)* null, i8 addrspace(4)** %arrayidx, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000 ; exit once 10000 elements have been written
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/nontemporal_store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/nontemporal_store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/nontemporal_store.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/nontemporal_store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -loop-idiom < %s -S | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<targetir>,require<scalar-evolution>,loop(loop-idiom)' < %s -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.bigBlock_t = type { [256 x <4 x float>] } ; single field: an array of 256 four-float vectors
+
+; CHECK-LABEL: @test(
+; CHECK-NOT: llvm.memset
+define void @test(%struct.bigBlock_t* %p) { ; zeroes the block with pairs of <4 x float> stores that are tagged !nontemporal
+entry:
+  %0 = getelementptr inbounds %struct.bigBlock_t, %struct.bigBlock_t* %p, i64 0, i32 0, i64 0, i64 0 ; pointer to the first float of the block
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %index.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %dst.01 = phi float* [ %0, %entry ], [ %add.ptr2, %for.body ]
+  %cast.i5 = bitcast float* %dst.01 to <4 x float>*
+  store <4 x float> zeroinitializer, <4 x float>* %cast.i5, align 16, !nontemporal !0 ; first vector of the pair
+  %add.ptr1 = getelementptr inbounds float, float* %dst.01, i64 4
+  %cast.i = bitcast float* %add.ptr1 to <4 x float>*
+  store <4 x float> zeroinitializer, <4 x float>* %cast.i, align 16, !nontemporal !0 ; second vector, 4 floats further on
+  %add.ptr2 = getelementptr inbounds float, float* %dst.01, i64 8 ; destination advances by 8 floats per iteration
+  %add = add nuw nsw i32 %index.02, 32 ; counter advances by 32 per iteration
+  %cmp = icmp ult i32 %add, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!0 = !{i32 1} ; operand of the !nontemporal attachment on both stores in @test

Added: llvm/trunk/test/Transforms/LoopIdiom/pr28196.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/pr28196.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/pr28196.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/pr28196.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,53 @@
+; RUN: opt -loop-idiom -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1() { ; copies element i+1 to element i for i = 0..5, with null as the base pointer
+entry:
+  br label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.body, %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %add.ptr3 = getelementptr inbounds i32, i32* null, i32 %indvars.iv ; destination: element i counted from null
+  %add.ptr4 = getelementptr inbounds i32, i32* %add.ptr3, i32 1 ; source: the next element
+  %0 = load i32, i32* %add.ptr4, align 4
+  store i32 %0, i32* %add.ptr3, align 4
+  %indvars.iv.next = add nsw i32 %indvars.iv, 1
+  %exitcond = icmp ne i32 %indvars.iv.next, 6 ; six i32 elements per pass, i.e. 24 bytes
+  br i1 %exitcond, label %for.body, label %for.body.preheader ; the exit edge re-enters the preheader; the function has no ret
+}
+
+; CHECK-LABEL: define void @test1(
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 null, i8* align 4 inttoptr (i64 4 to i8*), i64 24, i1 false)
+; CHECK-NOT: store
+
+define void @test1_no_null_opt() #0 { ; same body as @test1, but carrying attribute group #0
+entry:
+  br label %for.body.preheader
+
+for.body.preheader:                               ; preds = %for.body, %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %add.ptr3 = getelementptr inbounds i32, i32* null, i32 %indvars.iv ; destination: element i counted from null
+  %add.ptr4 = getelementptr inbounds i32, i32* %add.ptr3, i32 1 ; source: the next element
+  %0 = load i32, i32* %add.ptr4, align 4
+  store i32 %0, i32* %add.ptr3, align 4
+  %indvars.iv.next = add nsw i32 %indvars.iv, 1
+  %exitcond = icmp ne i32 %indvars.iv.next, 6
+  br i1 %exitcond, label %for.body, label %for.body.preheader ; the exit edge re-enters the preheader; the function has no ret
+}
+
+; CHECK-LABEL: define void @test1_no_null_opt(
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: getelementptr
+; CHECK: getelementptr
+; CHECK: load
+; CHECK: store
+
+attributes #0 = { "null-pointer-is-valid"="true" } ; used only by @test1_no_null_opt in this file

Added: llvm/trunk/test/Transforms/LoopIdiom/pr33114.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/pr33114.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/pr33114.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/pr33114.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Check that we're not crashing while looking at the recurrence variable.
+; RUN: opt -S -loop-idiom %s | FileCheck %s
+
+define void @tinkywinky() {
+; CHECK-LABEL: @tinkywinky(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[PH:%.*]]
+; CHECK:       ph:
+; CHECK-NEXT:    [[MYPHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PATATINO:%.*]] = ashr i32 [[MYPHI]], undef
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[PATATINO]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[EXIT_LOOPEXIT:%.*]], label [[IF_END]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 true, label %exit, label %ph ; condition is the constant true, so %ph is only the false successor
+
+ph:
+  %myphi = phi i32 [ 1, %entry ] ; single-entry phi that feeds the shift inside the loop
+  br label %if.end
+
+if.end:
+  %patatino = ashr i32 %myphi, undef ; shift amount is undef; the shifted value is defined outside the loop
+  %tobool = icmp eq i32 %patatino, 0
+  br i1 %tobool, label %exit, label %if.end ; self-loop on %if.end until the compare is true
+
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/scev-invalidation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/scev-invalidation.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/scev-invalidation.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/scev-invalidation.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,74 @@
+; RUN: opt -S -indvars -loop-idiom < %s
+; PR14214
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @quote_arg() nounwind { ; outer scan loop over %p.0 with two inner loops that each write runs of byte 92
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %backslashes.0 = phi i32 [ undef, %entry ], [ %backslashes.2, %for.inc ]
+  %p.0 = phi i8* [ undef, %entry ], [ %incdec.ptr3, %for.inc ] ; read cursor, starts as undef
+  %q.0 = phi i8* [ undef, %entry ], [ %q.2, %for.inc ] ; write cursor, starts as undef
+  %0 = load i8, i8* %p.0, align 1
+  switch i8 %0, label %while.cond.preheader [
+    i8 0, label %for.cond4.preheader
+    i8 92, label %for.inc
+  ]
+
+while.cond.preheader:                             ; preds = %for.cond
+  %tobool210 = icmp eq i32 %backslashes.0, 0
+  br i1 %tobool210, label %for.inc.loopexit, label %while.body.lr.ph ; skip the first inner loop when the count is 0
+
+while.body.lr.ph:                                 ; preds = %while.cond.preheader
+  %1 = add i32 %backslashes.0, -1
+  %2 = zext i32 %1 to i64 ; count minus one, widened; used after the loop to form %scevgep13
+  br label %while.body
+
+for.cond4.preheader:                              ; preds = %for.cond
+  %tobool57 = icmp eq i32 %backslashes.0, 0
+  br i1 %tobool57, label %for.end10, label %for.body6.lr.ph ; skip the second inner loop when the count is 0
+
+for.body6.lr.ph:                                  ; preds = %for.cond4.preheader
+  br label %for.body6
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.body
+  %q.112 = phi i8* [ %q.0, %while.body.lr.ph ], [ %incdec.ptr, %while.body ]
+  %backslashes.111 = phi i32 [ %backslashes.0, %while.body.lr.ph ], [ %dec, %while.body ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %q.112, i64 1
+  store i8 92, i8* %incdec.ptr, align 1 ; writes byte 92 one past the current cursor
+  %dec = add nsw i32 %backslashes.111, -1
+  %tobool2 = icmp eq i32 %dec, 0
+  br i1 %tobool2, label %while.cond.for.inc.loopexit_crit_edge, label %while.body ; counts down to zero
+
+while.cond.for.inc.loopexit_crit_edge:            ; preds = %while.body
+  %scevgep.sum = add i64 %2, 1
+  %scevgep13 = getelementptr i8, i8* %q.0, i64 %scevgep.sum ; %q.0 advanced by (count - 1) + 1 bytes
+  br label %for.inc.loopexit
+
+for.inc.loopexit:                                 ; preds = %while.cond.for.inc.loopexit_crit_edge, %while.cond.preheader
+  %q.1.lcssa = phi i8* [ %scevgep13, %while.cond.for.inc.loopexit_crit_edge ], [ %q.0, %while.cond.preheader ]
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.loopexit, %for.cond
+  %backslashes.2 = phi i32 [ %backslashes.0, %for.cond ], [ 0, %for.inc.loopexit ] ; count is reset to 0 after the first inner loop path
+  %q.2 = phi i8* [ %q.0, %for.cond ], [ %q.1.lcssa, %for.inc.loopexit ]
+  %incdec.ptr3 = getelementptr inbounds i8, i8* %p.0, i64 1 ; advance the read cursor by one byte
+  br label %for.cond
+
+for.body6:                                        ; preds = %for.body6.lr.ph, %for.body6
+  %q.39 = phi i8* [ %q.0, %for.body6.lr.ph ], [ %incdec.ptr7, %for.body6 ]
+  %backslashes.38 = phi i32 [ %backslashes.0, %for.body6.lr.ph ], [ %dec9, %for.body6 ]
+  %incdec.ptr7 = getelementptr inbounds i8, i8* %q.39, i64 1
+  store i8 92, i8* %incdec.ptr7, align 1 ; same store pattern as in %while.body
+  %dec9 = add nsw i32 %backslashes.38, -1
+  %tobool5 = icmp eq i32 %dec9, 0
+  br i1 %tobool5, label %for.cond4.for.end10_crit_edge, label %for.body6 ; counts down to zero
+
+for.cond4.for.end10_crit_edge:                    ; preds = %for.body6
+  br label %for.end10
+
+for.end10:                                        ; preds = %for.cond4.for.end10_crit_edge, %for.cond4.preheader
+  ret i32 undef
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/scev-invalidation_topmostloop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/scev-invalidation_topmostloop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/scev-invalidation_topmostloop.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/scev-invalidation_topmostloop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt -S -indvars -loop-idiom -verify -loop-simplifycfg -loop-idiom < %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: @f1()
+; CHECK-NEXT: entry:
+define void @f1() { ; control-flow-only reproducer: two nested cycles, no memory operations, every condition is undef
+entry:
+  br label %lbl1
+
+lbl1:                                             ; preds = %if.end, %entry
+  br label %for ; %lbl1 is re-entered from %if.end (outer cycle)
+
+for:                                              ; preds = %if.end, %lbl1
+  br label %lor.end ; %for is also re-entered from %if.end (inner cycle)
+
+lor.end:                                          ; preds = %for
+  br i1 undef, label %for.end, label %if.end ; the only edge that leaves both cycles
+
+if.end:                                           ; preds = %lor.end
+  br i1 undef, label %lbl1, label %for ; back edge to either cycle entry
+
+for.end:                                          ; preds = %lor.end
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/struct-custom-dl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/struct-custom-dl.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/struct-custom-dl.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/struct-custom-dl.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,212 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:40:64:64:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.foo = type { i32, i32 }
+%struct.foo1 = type { i32, i32, i32 }
+%struct.foo2 = type { i32, i16, i16 }
+
+;void bar1(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 0;
+;  }
+;}
+define void @bar1(%struct.foo* %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i32 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i32 %indvars.iv, i32 1
+  store i32 0, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp ne i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar1(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}
+
+;void bar2(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].b = 0;
+;    f[i].a = 0;
+;  }
+;}
+define void @bar2(%struct.foo* %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i32 %indvars.iv, i32 1
+  store i32 0, i32* %b, align 4
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i32 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp ne i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar2(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}
+
+;void bar3(foo_t *f, unsigned n) {
+;  for (unsigned i = n; i > 0; --i) {
+;    f[i].a = 0;
+;    f[i].b = 0;
+;  }
+;}
+define void @bar3(%struct.foo* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ %n, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i32 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i32 %indvars.iv, i32 1
+  store i32 0, i32* %b, align 4
+  %dec = add i32 %indvars.iv, -1
+  %cmp = icmp eq i32 %dec, 0
+  %indvars.iv.next = add nsw i32 %indvars.iv, -1
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar3(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}
+
+;void bar4(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 1;
+;  }
+;}
+define void @bar4(%struct.foo* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i32 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i32 %indvars.iv, i32 1
+  store i32 1, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp ne i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar4(
+; CHECK-NOT: call void @llvm.memset
+}
+
+;void bar5(foo1_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 0;
+;  }
+;}
+define void @bar5(%struct.foo1* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i32 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i32 %indvars.iv, i32 1
+  store i32 0, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp ne i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar5(
+; CHECK-NOT: call void @llvm.memset
+}
+
+;void bar6(foo2_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 0;
+;    f[i].c = 0;
+;  }
+;}
+define void @bar6(%struct.foo2* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i32 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i32 %indvars.iv, i32 1
+  store i16 0, i16* %b, align 4
+  %c = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i32 %indvars.iv, i32 2
+  store i16 0, i16* %c, align 2
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp ne i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar6(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/struct.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/struct.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/struct.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/struct.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,221 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+target triple = "x86_64-apple-darwin10.0.0"
+
+%struct.foo = type { i32, i32 }
+%struct.foo1 = type { i32, i32, i32 }
+%struct.foo2 = type { i32, i16, i16 }
+
+;void bar1(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 0;
+;  }
+;}
+define void @bar1(%struct.foo* %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1
+  store i32 0, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar1(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}
+
+;void bar2(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].b = 0;
+;    f[i].a = 0;
+;  }
+;}
+define void @bar2(%struct.foo* %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1
+  store i32 0, i32* %b, align 4
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar2(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}
+
+;void bar3(foo_t *f, unsigned n) {
+;  for (unsigned i = n; i > 0; --i) {
+;    f[i].a = 0;
+;    f[i].b = 0;
+;  }
+;}
+define void @bar3(%struct.foo* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1
+  store i32 0, i32* %b, align 4
+  %1 = trunc i64 %indvars.iv to i32
+  %dec = add i32 %1, -1
+  %cmp = icmp eq i32 %dec, 0
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar3(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}
+
+;void bar4(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 1;
+;  }
+;}
+define void @bar4(%struct.foo* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1
+  store i32 1, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar4(
+; CHECK-NOT: call void @llvm.memset 
+}
+
+;void bar5(foo1_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 0;
+;  }
+;}
+define void @bar5(%struct.foo1* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 1
+  store i32 0, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar5(
+; CHECK-NOT: call void @llvm.memset 
+}
+
+;void bar6(foo2_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 0;
+;    f[i].c = 0;
+;  }
+;}
+define void @bar6(%struct.foo2* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 1
+  store i16 0, i16* %b, align 4
+  %c = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 2
+  store i16 0, i16* %c, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar6(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/struct_pattern.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/struct_pattern.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/struct_pattern.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/struct_pattern.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,186 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
+; CHECK: @.memset_pattern.1 = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
+; CHECK: @.memset_pattern.2 = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
+
+target triple = "x86_64-apple-darwin10.0.0"
+
+%struct.foo = type { i32, i32 }
+%struct.foo1 = type { i32, i32, i32 }
+
+;void bar1(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 2;
+;    f[i].b = 2;
+;  }
+;}
+define void @bar1(%struct.foo* %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0
+  store i32 2, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1
+  store i32 2, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar1(
+; CHECK: call void @memset_pattern16
+; CHECK-NOT: store
+}
+
+;void bar2(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].b = 2;
+;    f[i].a = 2;
+;  }
+;}
+define void @bar2(%struct.foo* %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1
+  store i32 2, i32* %b, align 4
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0
+  store i32 2, i32* %a, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar2(
+; CHECK: call void @memset_pattern16
+; CHECK-NOT: store
+}
+
+;void bar3(foo_t *f, unsigned n) {
+;  for (unsigned i = n; i > 0; --i) {
+;    f[i].a = 2;
+;    f[i].b = 2;
+;  }
+;}
+define void @bar3(%struct.foo* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0
+  store i32 2, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1
+  store i32 2, i32* %b, align 4
+  %1 = trunc i64 %indvars.iv to i32
+  %dec = add i32 %1, -1
+  %cmp = icmp eq i32 %dec, 0
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar3(
+; CHECK: call void @memset_pattern16
+; CHECK-NOT: store
+}
+
+;void bar4(foo_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 0;
+;    f[i].b = 1;
+;  }
+;}
+define void @bar4(%struct.foo* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0
+  store i32 0, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1
+  store i32 1, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar4(
+; CHECK-NOT: call void @memset_pattern16 
+}
+
+;void bar5(foo1_t *f, unsigned n) {
+;  for (unsigned i = 0; i < n; ++i) {
+;    f[i].a = 1;
+;    f[i].b = 1;
+;  }
+;}
+define void @bar5(%struct.foo1* nocapture %f, i32 %n) nounwind ssp {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %a = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0
+  store i32 1, i32* %a, align 4
+  %b = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 1
+  store i32 1, i32* %b, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @bar5(
+; CHECK-NOT: call void @memset_pattern16
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/unordered-atomic-memcpy-noarch.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+;; memcpy.atomic formation (atomic load & store) -- element size 2
+;;  Will not create call due to a max element size of 0
+define void @test1(i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test1(
+; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
+; CHECK: store
+; CHECK: ret void
+bb.nph:
+  %Base = alloca i16, i32 10000
+  %Dest = alloca i16, i32 10000
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i16, i16* %Base, i64 %indvar
+  %DestI = getelementptr i16, i16* %Dest, i64 %indvar
+  %V = load atomic i16, i16* %I.0.014 unordered, align 2
+  store atomic i16 %V, i16* %DestI unordered, align 2
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/unroll-custom-dl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/unroll-custom-dl.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/unroll-custom-dl.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/unroll-custom-dl.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,78 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
+
+target triple = "x86_64-apple-darwin10.0.0"
+
+;void test(int *f, unsigned n) {
+;  for (unsigned i = 0; i < 2 * n; i += 2) {
+;    f[i] = 0;
+;    f[i+1] = 0;
+;  }
+;}
+define void @test(i32* %f, i32 %n) nounwind ssp {
+entry:
+  %0 = shl i32 %n, 1
+  %cmp1 = icmp eq i32 %0, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %f, i32 %indvars.iv
+  store i32 0, i32* %arrayidx, align 4
+  %1 = or i32 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %f, i32 %1
+  store i32 0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 2
+  %cmp = icmp ult i32 %indvars.iv.next, %0
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @test(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}
+
+;void test_pattern(int *f, unsigned n) {
+;  for (unsigned i = 0; i < 2 * n; i += 2) {
+;    f[i] = 2;
+;    f[i+1] = 2;
+;  }
+;}
+define void @test_pattern(i32* %f, i32 %n) nounwind ssp {
+entry:
+  %mul = shl i32 %n, 1
+  %cmp1 = icmp eq i32 %mul, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %f, i32 %indvars.iv
+  store i32 2, i32* %arrayidx, align 4
+  %x1 = or i32 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %f, i32 %x1
+  store i32 2, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 2
+  %cmp = icmp ult i32 %indvars.iv.next, %mul
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @test_pattern(
+; CHECK: call void @memset_pattern16
+; CHECK-NOT: store
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/unroll.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/unroll.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/unroll.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/unroll.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,80 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
+
+target triple = "x86_64-apple-darwin10.0.0"
+
+;void test(int *f, unsigned n) {
+;  for (unsigned i = 0; i < 2 * n; i += 2) {
+;    f[i] = 0;
+;    f[i+1] = 0;
+;  }
+;}
+define void @test(i32* %f, i32 %n) nounwind ssp {
+entry:
+  %mul = shl i32 %n, 1
+  %cmp1 = icmp eq i32 %mul, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %mul to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
+  store i32 0, i32* %arrayidx, align 4
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %f, i64 %1
+  store i32 0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @test(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: store
+}
+
+;void test_pattern(int *f, unsigned n) {
+;  for (unsigned i = 0; i < 2 * n; i += 2) {
+;    f[i] = 2;
+;    f[i+1] = 2;
+;  }
+;}
+define void @test_pattern(i32* %f, i32 %n) nounwind ssp {
+entry:
+  %mul = shl i32 %n, 1
+  %cmp1 = icmp eq i32 %mul, 0
+  br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %0 = zext i32 %mul to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
+  store i32 2, i32* %arrayidx, align 4
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %f, i64 %1
+  store i32 2, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+; CHECK-LABEL: @test_pattern(
+; CHECK: call void @memset_pattern16
+; CHECK-NOT: store
+}

Added: llvm/trunk/test/Transforms/LoopIdiom/unsafe.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/unsafe.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/unsafe.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/unsafe.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,55 @@
+; RUN: opt -S < %s -loop-idiom | FileCheck %s
+; CHECK-NOT: memset
+; check that memset is not generated (for stores) because that will result
+; in udiv hoisted out of the loop by the SCEV Expander
+; TODO: ideally we should be able to generate memset
+; if SCEV expander is taught to generate the dependencies
+; at the right point.
+
+@a = global i32 0, align 4
+@b = global i32 0, align 4
+@c = external local_unnamed_addr global [1 x i8], align 1
+
+define void @e() local_unnamed_addr {
+entry:
+  %d0 = load i32, i32* @a, align 4
+  %d1 = load i32, i32* @b, align 4
+  br label %for.cond1thread-pre-split
+
+for.cond1thread-pre-split:                        ; preds = %for.body5, %entry
+  %div = udiv i32 %d0, %d1
+  br label %for.body5
+
+for.body5:                                        ; preds = %for.body5, %for.cond1thread-pre-split
+  %indvars.iv = phi i64 [ 0, %for.cond1thread-pre-split ], [ %indvars.iv.next, %for.body5 ]
+  %divx = sext i32 %div to i64
+  %0 = add nsw i64 %divx, %indvars.iv
+  %arrayidx = getelementptr inbounds [1 x i8], [1 x i8]* @c, i64 0, i64 %0
+  store i8 0, i8* %arrayidx, align 1
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %1 = trunc i64 %indvars.iv.next to i32
+  %tobool4 = icmp eq i32 %1, 0
+  br i1 %tobool4, label %for.cond1thread-pre-split, label %for.body5
+}
+
+; The loop's trip count depends on an unsafe operation,
+; udiv. SCEV expander hoists it out of the loop, so loop-idiom
+; should check that the memset is not generated in this case.
+define void @f(i32 %a, i32 %b, i8* nocapture %x) local_unnamed_addr {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body6, %entry
+  %div = udiv i32 %a, %b
+  %conv = zext i32 %div to i64
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.body
+  %i.09 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %x, i64 %i.09
+  store i8 0, i8* %arrayidx, align 1
+  %inc = add nuw nsw i64 %i.09, 1
+  %cmp3 = icmp slt i64 %inc, %conv
+  br i1 %cmp3, label %for.body6, label %for.body
+}
+

Added: llvm/trunk/test/Transforms/LoopIdiom/unwind.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopIdiom/unwind.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopIdiom/unwind.ll (added)
+++ llvm/trunk/test/Transforms/LoopIdiom/unwind.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+; RUN: opt -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @ff()
+
+define void @test(i8* noalias nocapture %base, i64 %size) #1 {
+entry:
+  %cmp3 = icmp eq i64 %size, 0
+  br i1 %cmp3, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+; CHECK-LABEL: @test(
+; CHECK-NOT: llvm.memset
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  tail call void @ff()
+  %arrayidx = getelementptr inbounds i8, i8* %base, i64 %indvars.iv
+  store i8 0, i8* %arrayidx, align 1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %size
+  br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+attributes #1 = { uwtable }

Added: llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll (added)
+++ llvm/trunk/test/Transforms/LoopInstSimplify/basic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -passes=loop-instsimplify | FileCheck %s
+; RUN: opt -S %s -passes=loop-instsimplify -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
+
+; Test very basic folding and propagation occurs within a loop body. This should
+; collapse to the loop iteration structure and the LCSSA PHI node.
+define i32 @test1(i32 %n, i32 %x) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %x.add = add nsw i32 %x, 0
+  %x.sub = sub i32 %x.add, 0
+  %x.and = and i32 %x.sub, -1
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.and, %loop ]
+  ret i32 %x.lcssa
+}
+
+; Test basic loop structure that still has a simplification feed a prior PHI.
+define i32 @test2(i32 %n, i32 %x) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.next, %loop ]
+  %x.next = add nsw i32 %x.loop, 0
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop ]
+  ret i32 %x.lcssa
+}
+
+; Test a diamond CFG with inner PHI nodes.
+define i32 @test3(i32 %n, i32 %x) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[X_CMP:%.*]] = icmp slt i32 [[I]], 42
+; CHECK-NEXT:    br i1 [[X_CMP]], label [[LOOP_LHS:%.*]], label [[LOOP_RHS:%.*]]
+; CHECK:       loop.lhs:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.rhs:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.phi, %loop.latch ]
+  %x.add = add nsw i32 %x.loop, 0
+  %x.cmp = icmp slt i32 %i, 42
+  br i1 %x.cmp, label %loop.lhs, label %loop.rhs
+
+loop.lhs:
+  %x.l.add = add nsw i32 %x.add, 0
+  br label %loop.latch
+
+loop.rhs:
+  %x.r.sub = sub nsw i32 %x.add, 0
+  br label %loop.latch
+
+loop.latch:
+  %x.phi = phi i32 [ %x.l.add, %loop.lhs ], [ %x.r.sub, %loop.rhs ]
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop.latch ]
+  ret i32 %x.lcssa
+}
+
+; Test an inner loop that is only simplified when processing the outer loop, and
+; an outer loop only simplified when processing the inner loop.
+define i32 @test4(i32 %n, i32 %m, i32 %x) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_INNER:%.*]]
+; CHECK:       loop.inner:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[J_NEXT:%.*]], [[LOOP_INNER]] ]
+; CHECK-NEXT:    [[J_NEXT]] = add nsw i32 [[J]], 1
+; CHECK-NEXT:    [[J_CMP:%.*]] = icmp slt i32 [[J_NEXT]], [[M:%.*]]
+; CHECK-NEXT:    br i1 [[J_CMP]], label [[LOOP_INNER]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nsw i32 [[I]], 1
+; CHECK-NEXT:    [[I_CMP:%.*]] = icmp slt i32 [[I_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[I_CMP]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[X_LCSSA:%.*]] = phi i32 [ [[X:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i32 [[X_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop.latch ]
+  %x.loop = phi i32 [ %x, %entry ], [ %x.inner.lcssa, %loop.latch ]
+  %x.add = add nsw i32 %x.loop, 0
+  br label %loop.inner
+
+loop.inner:
+  %j = phi i32 [ 0, %loop ], [ %j.next, %loop.inner ]
+  %x.inner.loop = phi i32 [ %x.add, %loop ], [ %x.inner.add, %loop.inner ]
+  %x.inner.add = add nsw i32 %x.inner.loop, 0
+  %j.next = add nsw i32 %j, 1
+  %j.cmp = icmp slt i32 %j.next, %m
+  br i1 %j.cmp, label %loop.inner, label %loop.latch
+
+loop.latch:
+  %x.inner.lcssa = phi i32 [ %x.inner.loop, %loop.inner ]
+  %i.next = add nsw i32 %i, 1
+  %i.cmp = icmp slt i32 %i.next, %n
+  br i1 %i.cmp, label %loop, label %exit
+
+exit:
+  %x.lcssa = phi i32 [ %x.loop, %loop.latch ]
+  ret i32 %x.lcssa
+}

Added: llvm/trunk/test/Transforms/LoopInterchange/call-instructions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInterchange/call-instructions.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInterchange/call-instructions.ll (added)
+++ llvm/trunk/test/Transforms/LoopInterchange/call-instructions.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,120 @@
+; REQUIRES: asserts
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info -stats 2>&1 | FileCheck -check-prefix=STATS %s
+; RUN: FileCheck --input-file=%t %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i32]] zeroinitializer
+
+declare void @foo(i64 %a)
+declare void @bar(i64 %a) readnone
+
+;;--------------------------------------Test case 01------------------------------------
+;; Not safe to interchange, because the called function `foo` is not marked as
+;; readnone, so it could introduce dependences.
+;;
+;;  for(int i=0;i<100;i++) {
+;;    for(int j=1;j<100;j++) {
+;;      foo(i);
+;;      A[j][i] = A[j][i]+k;
+;;    }
+;; }
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            CallInst
+; CHECK-NEXT: Function:        interchange_01
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Cannot interchange loops due to call instruction.
+
+define void @interchange_01(i32 %k) {
+entry:
+  br label %for1.header
+
+for1.header:
+  %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc10 ]
+  br label %for2
+
+for2:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for2 ], [ 1, %for1.header ]
+  call void @foo(i64 %indvars.iv23)
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+  %lv = load i32, i32* %arrayidx5
+  %add = add nsw i32 %lv, %k
+  store i32 %add, i32* %arrayidx5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 99
+  br i1 %exitcond, label %for2.loopexit , label %for2
+
+for2.loopexit:
+  br label %for1.inc10
+
+for1.inc10:
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %exitcond26 = icmp eq i64 %indvars.iv23, 99
+  br i1 %exitcond26, label %for1.loopexit, label %for1.header
+
+for1.loopexit:
+  br label %exit
+
+exit:
+  ret void
+}
+
+;;--------------------------------------Test case 02------------------------------------
+;; Safe to interchange, because the called function `bar` is marked as readnone,
+;; so it cannot introduce dependences.
+;;
+;;  for(int i=0;i<100;i++) {
+;;    for(int j=1;j<100;j++) {
+;;      bar(i);
+;;      A[j][i] = A[j][i]+k;
+;;    }
+;; }
+
+; CHECK: --- !Passed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_02
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+; CHECK-NEXT: ...
+
+define void @interchange_02(i32 %k) {
+entry:
+  br label %for1.header
+
+for1.header:
+  %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc10 ]
+  br label %for2
+
+for2:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for2 ], [ 1, %for1.header ]
+  call void @bar(i64 %indvars.iv23)
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+  %lv = load i32, i32* %arrayidx5
+  %add = add nsw i32 %lv, %k
+  store i32 %add, i32* %arrayidx5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 99
+  br i1 %exitcond, label %for2.loopexit , label %for2
+
+for2.loopexit:
+  br label %for1.inc10
+
+for1.inc10:
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %exitcond26 = icmp eq i64 %indvars.iv23, 99
+  br i1 %exitcond26, label %for1.loopexit, label %for1.header
+
+for1.loopexit:
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check stats, we interchanged 1 out of 2 loops.
+; STATS: 1 loop-interchange - Number of loops interchanged

Added: llvm/trunk/test/Transforms/LoopInterchange/currentLimitation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInterchange/currentLimitation.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInterchange/currentLimitation.ll (added)
+++ llvm/trunk/test/Transforms/LoopInterchange/currentLimitation.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,97 @@
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' \
+; RUN:   -pass-remarks-output=%t -verify-loop-info -verify-dom-info -S | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+ 
+@A = common global [100 x [100 x i32]] zeroinitializer
+@B = common global [100 x [100 x [100 x i32]]] zeroinitializer
+@C = common global [100 x [100 x i64]] zeroinitializer
+ 
+;;--------------------------------------Test case 01------------------------------------
+;; [FIXME] This loop, though valid, is currently not interchanged due to the limitation that we cannot split the inner loop latch due to multiple uses of the inner induction
+;; variable (used to increment the loop counter and to access A[j+1][i+1]).
+;;  for(int i=0;i<N-1;i++)
+;;    for(int j=1;j<N-1;j++)
+;;      A[j+1][i+1] = A[j+1][i+1] + k;
+
+; FIXME: Currently fails because of DA changes.
+; IR-LABEL: @interchange_01
+; IR-NOT: split
+
+; CHECK:      Name:            Dependence
+; CHECK-NEXT: Function:        interchange_01
+
+define void @interchange_01(i32 %k, i32 %N) {
+ entry:
+   %sub = add nsw i32 %N, -1
+   %cmp26 = icmp sgt i32 %N, 1
+   br i1 %cmp26, label %for.cond1.preheader.lr.ph, label %for.end17
+ 
+ for.cond1.preheader.lr.ph:
+   %cmp324 = icmp sgt i32 %sub, 1
+   %0 = add i32 %N, -2
+   %1 = sext i32 %sub to i64
+   br label %for.cond1.preheader
+ 
+ for.cond.loopexit:
+   %cmp = icmp slt i64 %indvars.iv.next29, %1
+   br i1 %cmp, label %for.cond1.preheader, label %for.end17
+ 
+ for.cond1.preheader:
+   %indvars.iv28 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next29, %for.cond.loopexit ]
+   %indvars.iv.next29 = add nuw nsw i64 %indvars.iv28, 1
+   br i1 %cmp324, label %for.body4, label %for.cond.loopexit
+ 
+ for.body4:
+   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4 ], [ 1, %for.cond1.preheader ]
+   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+   %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %indvars.iv.next, i64 %indvars.iv.next29
+   %2 = load i32, i32* %arrayidx7
+   %add8 = add nsw i32 %2, %k
+   store i32 %add8, i32* %arrayidx7
+   %lftr.wideiv = trunc i64 %indvars.iv to i32
+   %exitcond = icmp eq i32 %lftr.wideiv, %0
+   br i1 %exitcond, label %for.cond.loopexit, label %for.body4
+ 
+ for.end17: 
+   ret void
+}
+
+; We currently cannot interchange this loop, because the transform
+; expects the latches to be the exiting blocks too.
+
+; IR-LABEL: @interchange_02
+; IR-NOT: split
+;
+; CHECK:      Name:            ExitingNotLatch
+; CHECK-NEXT: Function:        interchange_02
+define void @interchange_02(i64 %k, i64 %N) {
+entry:
+  br label %for1.header
+
+for1.header:
+  %j23 = phi i64 [ 0, %entry ], [ %j.next24, %for1.inc10 ]
+  br label %for2
+
+for2:
+  %j = phi i64 [ %j.next, %latch ], [ 0, %for1.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* @C, i64 0, i64 %j, i64 %j23
+  %lv = load i64, i64* %arrayidx5
+  %add = add nsw i64 %lv, %k
+  store i64 %add, i64* %arrayidx5
+  %exitcond = icmp eq i64 %j, 99
+  br i1 %exitcond, label %for1.inc10, label %latch
+latch:
+  %j.next = add nuw nsw i64 %j, 1
+  br label %for2
+
+for1.inc10:
+  %j.next24 = add nuw nsw i64 %j23, 1
+  %exitcond26 = icmp eq i64 %j23, 99
+  br i1 %exitcond26, label %for.end12, label %for1.header
+
+for.end12:
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopInterchange/debuginfo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInterchange/debuginfo.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInterchange/debuginfo.ll (added)
+++ llvm/trunk/test/Transforms/LoopInterchange/debuginfo.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,70 @@
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info | FileCheck %s
+; RUN: FileCheck -check-prefix=REMARK --input-file=%t %s
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i64]] zeroinitializer
+
+;;  for(int i=0;i<100;i++)
+;;    for(int j=0;j<100;j++)
+;;      A[j][i] = A[j][i]+k;
+
+; REMARK:      Name:            Interchanged
+; REMARK-NEXT: Function:        interchange_01
+; CHECK: split
+
+define void @interchange_01(i64 %k, i64 %N) !dbg !5 {
+entry:
+  br label %for1.header
+
+for1.header:
+  %j23 = phi i64 [ 0, %entry ], [ %j.next24, %for1.inc10 ]
+  call void @llvm.dbg.value(metadata i64 %j, metadata !13, metadata !DIExpression()), !dbg !14
+  br label %for2
+
+for2:
+  %j = phi i64 [ %j.next, %for2 ], [ 0, %for1.header ]
+  call void @llvm.dbg.value(metadata i64 %j, metadata !13, metadata !DIExpression()), !dbg !14
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* @A, i64 0, i64 %j, i64 %j23
+  %lv = load i64, i64* %arrayidx5
+  %add = add nsw i64 %lv, %k
+  store i64 %add, i64* %arrayidx5
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond = icmp eq i64 %j, 99
+  call void @llvm.dbg.value(metadata i64 %j, metadata !13, metadata !DIExpression()), !dbg !14
+  br i1 %exitcond, label %for1.inc10, label %for2
+
+for1.inc10:
+  %j.next24 = add nuw nsw i64 %j23, 1
+  call void @llvm.dbg.value(metadata i64 %j, metadata !13, metadata !DIExpression()), !dbg !14
+  %exitcond26 = icmp eq i64 %j23, 99
+  br i1 %exitcond26, label %for.end12, label %for1.header
+
+for.end12:
+  ret void
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "/test")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !11}
+!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
+!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!12 = !{!13}
+!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
+!14 = !DILocation(line: 1, column: 27, scope: !5)

Added: llvm/trunk/test/Transforms/LoopInterchange/inner-only-reductions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInterchange/inner-only-reductions.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInterchange/inner-only-reductions.ll (added)
+++ llvm/trunk/test/Transforms/LoopInterchange/inner-only-reductions.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,124 @@
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info -verify-loop-lcssa 2>&1 | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
+
+; Inner loop only reductions are not supported currently. See discussion at
+; D53027 for more information on the required checks.
+
+@A = common global [500 x [500 x i32]] zeroinitializer
+@X = common global i32 0
+@B = common global [500 x [500 x i32]] zeroinitializer
+@Y = common global i32 0
+
+;; global X
+
+;;  for( int i=1;i<N;i++)
+;;    for( int j=1;j<N;j++)
+;;      X+=A[j][i];
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHI
+; CHECK-NEXT: Function:        reduction_01
+
+; IR-LABEL: @reduction_01(
+; IR-NOT: split
+
+define void @reduction_01(i32 %N) {
+entry:
+  %cmp16 = icmp sgt i32 %N, 1
+  br i1 %cmp16, label %for.body3.lr.ph, label %for.end8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
+  %X.promoted = load i32, i32* @X
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %add15 = phi i32 [ %X.promoted, %for.body3.lr.ph ], [ %add, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
+  %0 = load i32, i32* %arrayidx5
+  %add = add nsw i32 %add15, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body3
+
+for.cond1.for.inc6_crit_edge:                     ; preds = %for.body3
+  %add.lcssa = phi i32 [ %add, %for.body3 ]
+  store i32 %add.lcssa, i32* @X
+  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
+  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
+  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
+  br i1 %exitcond21, label %for.end8, label %for.body3.lr.ph
+
+for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  ret void
+}
+
+;; Not tightly nested. Do not interchange.
+;;  for( int i=1;i<N;i++)
+;;    for( int j=1;j<N;j++) {
+;;      for( int k=1;k<N;k++) {
+;;        X+=A[k][j];
+;;      }
+;;      Y+=B[j][i];
+;;    }
+
+;; Not tightly nested. Do not interchange.
+;; Not interchanged hence the phi's in the inner loop will not be split.
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_03
+
+; IR-LABEL: @reduction_03(
+; IR-NOT: split
+
+define void @reduction_03(i32 %N) {
+entry:
+  %cmp35 = icmp sgt i32 %N, 1
+  br i1 %cmp35, label %for.cond4.preheader.lr.ph, label %for.end19
+
+for.cond4.preheader.lr.ph:                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
+  %indvars.iv41 = phi i64 [ %indvars.iv.next42, %for.cond1.for.inc17_crit_edge ], [ 1, %entry ]
+  %Y.promoted = load i32, i32* @Y
+  br label %for.body6.lr.ph
+
+for.body6.lr.ph:                                  ; preds = %for.cond4.for.end_crit_edge, %for.cond4.preheader.lr.ph
+  %indvars.iv37 = phi i64 [ 1, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next38, %for.cond4.for.end_crit_edge ]
+  %add1334 = phi i32 [ %Y.promoted, %for.cond4.preheader.lr.ph ], [ %add13, %for.cond4.for.end_crit_edge ]
+  %X.promoted = load i32, i32* @X
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
+  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv37
+  %0 = load i32, i32* %arrayidx8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.cond4.for.end_crit_edge, label %for.body6
+
+for.cond4.for.end_crit_edge:                      ; preds = %for.body6
+  %arrayidx12 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @B, i64 0, i64 %indvars.iv37, i64 %indvars.iv41
+  %1 = load i32, i32* %arrayidx12
+  %add13 = add nsw i32 %add1334, %1
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %lftr.wideiv39 = trunc i64 %indvars.iv.next38 to i32
+  %exitcond40 = icmp eq i32 %lftr.wideiv39, %N
+  br i1 %exitcond40, label %for.cond1.for.inc17_crit_edge, label %for.body6.lr.ph
+
+for.cond1.for.inc17_crit_edge:                    ; preds = %for.cond4.for.end_crit_edge
+  %add13.lcssa = phi i32 [ %add13, %for.cond4.for.end_crit_edge ]
+  store i32 %add13.lcssa, i32* @Y
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %lftr.wideiv43 = trunc i64 %indvars.iv.next42 to i32
+  %exitcond44 = icmp eq i32 %lftr.wideiv43, %N
+  br i1 %exitcond44, label %for.end19, label %for.cond4.preheader.lr.ph
+
+for.end19:                                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
+  ret void
+}

Added: llvm/trunk/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll (added)
+++ llvm/trunk/test/Transforms/LoopInterchange/interchange-flow-dep-outer.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,76 @@
+; REQUIRES: asserts
+; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info \
+; RUN:     -S -debug 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i32]] zeroinitializer
+@B = common global [100 x i32] zeroinitializer
+@C = common global [100 x [100 x i32]] zeroinitializer
+@D = common global [100 x [100 x [100 x i32]]] zeroinitializer
+
+;; Test that a flow dependency in outer loop doesn't prevent interchange in
+;; loops i and j.
+;;
+;;  for (int k = 0; k < 100; ++k) {
+;;    T[k] = fn1();
+;;    for (int i = 0; i < 1000; ++i)
+;;      for(int j = 1; j < 1000; ++j)
+;;        Arr[j][i] = Arr[j][i]+k;
+;;    fn2(T[k]);
+;;  }
+
+; CHECK: Processing Inner Loop Id = 2 and OuterLoopId = 1
+; CHECK: Loops interchanged.
+
+; CHECK: Processing Inner Loop Id = 1 and OuterLoopId = 0
+; CHECK: Not interchanging loops. Cannot prove legality.
+
+@T = internal global [100 x double] zeroinitializer, align 4
+@Arr = internal global [1000 x [1000 x i32]] zeroinitializer, align 4
+
+define void @interchange_09(i32 %k) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup4
+  ret void
+
+for.body:                                         ; preds = %for.cond.cleanup4, %entry
+  %indvars.iv45 = phi i64 [ 0, %entry ], [ %indvars.iv.next46, %for.cond.cleanup4 ]
+  %call = call double @fn1()
+  %arrayidx = getelementptr inbounds [100 x double], [100 x double]* @T, i64 0, i64 %indvars.iv45
+  store double %call, double* %arrayidx, align 8
+  br label %for.cond6.preheader
+
+for.cond6.preheader:                              ; preds = %for.cond.cleanup8, %for.body
+  %indvars.iv42 = phi i64 [ 0, %for.body ], [ %indvars.iv.next43, %for.cond.cleanup8 ]
+  br label %for.body9
+
+for.cond.cleanup4:                                ; preds = %for.cond.cleanup8
+  %tmp = load double, double* %arrayidx, align 8
+  call void @fn2(double %tmp)
+  %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
+  %exitcond47 = icmp ne i64 %indvars.iv.next46, 100
+  br i1 %exitcond47, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup8:                                ; preds = %for.body9
+  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+  %exitcond44 = icmp ne i64 %indvars.iv.next43, 1000
+  br i1 %exitcond44, label %for.cond6.preheader, label %for.cond.cleanup4
+
+for.body9:                                        ; preds = %for.body9, %for.cond6.preheader
+  %indvars.iv = phi i64 [ 1, %for.cond6.preheader ], [ %indvars.iv.next, %for.body9 ]
+  %arrayidx13 = getelementptr inbounds [1000 x [1000 x i32]], [1000 x [1000 x i32]]* @Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv42
+  %tmp1 = load i32, i32* %arrayidx13, align 4
+  %tmp2 = trunc i64 %indvars.iv45 to i32
+  %add = add nsw i32 %tmp1, %tmp2
+  store i32 %add, i32* %arrayidx13, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.body9, label %for.cond.cleanup8
+}
+
+declare double @fn1() readnone
+declare void @fn2(double) readnone

Added: llvm/trunk/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll (added)
+++ llvm/trunk/test/Transforms/LoopInterchange/interchange-insts-between-indvar.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,48 @@
+; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info \
+; RUN:     -S -pass-remarks=loop-interchange 2>&1 | FileCheck %s
+
+@A10 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16  ; 3x3 i32 array that @interchange_11 stores into
+
+;; Test to make sure we can handle zext instructions introduced by
+;; IndVarSimplify.
+;;
+;;  for (int i = 0; i < 2; ++i)
+;;    for(int j = 0; j < n; ++j) {
+;;      A[j][i] = i;
+;;    }
+
+; CHECK: Loop interchanged with enclosing loop.
+
+@A11 = local_unnamed_addr global [3 x [3 x i32]] zeroinitializer, align 16  ; not referenced anywhere else in this test file
+
+define void @interchange_11(i32 %n) {  ; two-deep loop nest; %n is the i32 bound of the inner loop
+entry:
+  br label %for.cond1.preheader
+
+for.cond.loopexit:                                ; preds = %for.body4
+  %exitcond28 = icmp ne i64 %indvars.iv.next27, 2  ; outer latch: keep looping until the incremented outer IV equals 2
+  br i1 %exitcond28, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond1.preheader:                              ; preds = %for.cond.loopexit, %entry
+  %indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ]
+  %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1  ; outer increment lives in the outer header and is also used as an index inside the inner loop
+  br label %for.body4
+
+for.cond.cleanup:                                 ; preds = %for.cond.loopexit
+  ret void
+
+for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ]
+; The store below does not appear in the C snippet above.
+; With two stores in the loop there may be WAW dependences, and interchange is illegal.
+;  %arrayidx6 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv26
+;  %tmp = trunc i64 %indvars.iv26 to i32
+;  store i32 %tmp, i32* %arrayidx6, align 4
+  %arrayidx10 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* @A10, i64 0, i64 %indvars.iv, i64 %indvars.iv.next27  ; &A10[inner IV][outer IV + 1]
+  %tmp1 = trunc i64 %indvars.iv to i32  ; value stored is the inner IV narrowed to i32
+  store i32 %tmp1, i32* %arrayidx10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %n.wide = zext i32 %n to i64  ; the zext sits between the inner IV increment and the exit compare that uses it
+  %exitcond = icmp ne i64 %indvars.iv.next, %n.wide  ; inner latch: loop until the incremented inner IV equals the widened %n
+  br i1 %exitcond, label %for.body4, label %for.cond.loopexit
+}

Added: llvm/trunk/test/Transforms/LoopInterchange/interchange-no-deps.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInterchange/interchange-no-deps.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInterchange/interchange-no-deps.ll (added)
+++ llvm/trunk/test/Transforms/LoopInterchange/interchange-no-deps.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,95 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-interchange -simplifycfg -pass-remarks-output=%t \
+; RUN:     -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange -stats -S 2>&1 \
+; RUN:     | FileCheck -check-prefix=STATS %s
+; RUN: FileCheck -input-file %t %s
+
+
+; no_deps_interchange just accesses a single nested array and can be interchanged.
+; CHECK:      Name:       Interchanged
+; CHECK-NEXT: Function:   no_deps_interchange
+define i32 @no_deps_interchange([1024 x i32]* nocapture %Arr) local_unnamed_addr #0 {  ; zero-fills %Arr[inner][outer] over a 1024 x 1024 iteration space
+entry:
+  br label %for1.header
+
+for1.header:                                         ; preds = %entry, %for1.inc
+  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ]
+  br label %for2
+
+for2:                                        ; preds = %for1.header, %for2
+  %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ]
+  %arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* %Arr, i64 %indvars.iv, i64 %indvars.iv19  ; inner IV picks the [1024 x i32] row, outer IV picks the element within it
+  store i32 0, i32* %arrayidx6, align 4  ; the only memory access in the nest
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1024  ; inner latch: 1024 iterations
+  br i1 %exitcond, label %for2, label %for1.inc
+
+for1.inc:
+  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+  %exitcond21 = icmp ne i64 %indvars.iv.next20, 1024  ; outer latch: 1024 iterations
+  br i1 %exitcond21, label %for1.header, label %exit
+
+exit:                                 ; preds = %for1.inc
+  ret i32 0
+
+}
+
+; Only the inner loop induction variable is used for memory accesses.
+; Interchanging is not beneficial.
+; CHECK:      Name:       InterchangeNotProfitable
+; CHECK-NEXT: Function:   no_bad_order
+define i32 @no_bad_order(i32* %Arr) {  ; stores 0 to %Arr[inner IV]; the outer IV never reaches an address
+entry:
+  br label %for1.header
+
+for1.header:                                         ; preds = %entry, %for1.inc
+  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ]
+  br label %for2
+
+for2:                                        ; preds = %for1.header, %for2
+  %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %Arr, i64 %indvars.iv  ; address depends on the inner IV only
+  store i32 0, i32* %arrayidx6, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1024  ; inner latch: 1024 iterations
+  br i1 %exitcond, label %for2, label %for1.inc
+
+for1.inc:
+  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+  %exitcond21 = icmp ne i64 %indvars.iv.next20, 1024  ; outer latch: 1024 iterations
+  br i1 %exitcond21, label %for1.header, label %exit
+
+exit:                                 ; preds = %for1.inc
+  ret i32 0
+}
+
+; No memory access is addressed by an induction variable, so interchanging is not beneficial.
+; CHECK:      Name:        InterchangeNotProfitable
+; CHECK-NEXT: Function:    no_mem_instrs
+define i32 @no_mem_instrs(i64* %ptr) {  ; every iteration stores to the same address, %ptr
+entry:
+  br label %for1.header
+
+for1.header:                                         ; preds = %entry, %for1.inc
+  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for1.inc ]
+  br label %for2
+
+for2:                                        ; preds = %for1.header, %for2
+  %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next, %for2 ]
+  store i64 %indvars.iv, i64* %ptr, align 4  ; the inner IV is the stored value, not part of the address
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1024  ; inner latch: 1024 iterations
+  br i1 %exitcond, label %for2, label %for1.inc
+
+for1.inc:
+  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+  %exitcond21 = icmp ne i64 %indvars.iv.next20, 1024  ; outer latch: 1024 iterations
+  br i1 %exitcond21, label %for1.header, label %exit
+
+exit:                                 ; preds = %for1.inc
+  ret i32 0
+}
+
+
+; Check stats, we interchanged 1 out of 3 loops.
+; STATS: 1 loop-interchange - Number of loops interchanged

Added: llvm/trunk/test/Transforms/LoopInterchange/interchangeable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopInterchange/interchangeable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopInterchange/interchangeable.ll (added)
+++ llvm/trunk/test/Transforms/LoopInterchange/interchangeable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [100 x [100 x i64]] zeroinitializer  ; 100x100 i64 array accessed by the interchange_* functions in this file
+@B = common global [100 x i64] zeroinitializer  ; not referenced by any function in this file
+
+;;  for(int i=0;i<100;i++)
+;;    for(int j=0;j<100;j++)
+;;      A[j][i] = A[j][i]+k;
+
+define void @interchange_01(i64 %k, i64 %N) {  ; adds %k to A[inner][outer] for both IVs in 0..99; %N is not referenced in the body
+; CHECK-LABEL: @interchange_01(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR2_PREHEADER:%.*]]
+; CHECK:       for1.header.preheader:
+; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
+; CHECK:       for1.header:
+; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR1_INC10:%.*]] ], [ 0, [[FOR1_HEADER_PREHEADER:%.*]] ]
+; CHECK-NEXT:    br label [[FOR2_SPLIT1:%.*]]
+; CHECK:       for2.preheader:
+; CHECK-NEXT:    br label [[FOR2:%.*]]
+; CHECK:       for2:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR2_SPLIT:%.*]] ], [ 0, [[FOR2_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR1_HEADER_PREHEADER]]
+; CHECK:       for2.split1:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* @A, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV23]]
+; CHECK-NEXT:    [[LV:%.*]] = load i64, i64* [[ARRAYIDX5]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[LV]], [[K:%.*]]
+; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX5]]
+; CHECK-NEXT:    br label [[FOR1_INC10]]
+; CHECK:       for2.split:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 99
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END12:%.*]], label [[FOR2]]
+; CHECK:       for1.inc10:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1
+; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i64 [[INDVARS_IV23]], 99
+; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_END12:%.*]], label [[FOR1_HEADER]]
+; CHECK:       for.end12:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for1.header
+
+for1.header:
+  %j23 = phi i64 [ 0, %entry ], [ %j.next24, %for1.inc10 ]  ; outer IV, starts at 0
+  br label %for2
+
+for2:
+  %j = phi i64 [ %j.next, %for2 ], [ 0, %for1.header ]  ; inner IV, restarts at 0 for every outer iteration
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* @A, i64 0, i64 %j, i64 %j23  ; &A[inner][outer]
+  %lv = load i64, i64* %arrayidx5
+  %add = add nsw i64 %lv, %k
+  store i64 %add, i64* %arrayidx5  ; read-modify-write of the same element
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond = icmp eq i64 %j, 99  ; compares the pre-increment value, so the last inner iteration has %j == 99
+  br i1 %exitcond, label %for1.inc10, label %for2
+
+for1.inc10:
+  %j.next24 = add nuw nsw i64 %j23, 1
+  %exitcond26 = icmp eq i64 %j23, 99  ; compares the pre-increment value, so the last outer iteration has %j23 == 99
+  br i1 %exitcond26, label %for.end12, label %for1.header
+
+for.end12:
+  ret void
+}
+
+;; for(int i=0;i<100;i++)
+;;   for(int j=100;j>=0;j--)
+;;     A[j][i] = A[j][i]+k;
+
+define void @interchange_02(i64 %k) {  ; like interchange_01, but the inner loop counts down from 100 to 0
+; CHECK-LABEL: @interchange_02(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR3_PREHEADER:%.*]]
+; CHECK:       for1.header.preheader:
+; CHECK-NEXT:    br label [[FOR1_HEADER:%.*]]
+; CHECK:       for1.header:
+; CHECK-NEXT:    [[INDVARS_IV19:%.*]] = phi i64 [ [[INDVARS_IV_NEXT20:%.*]], [[FOR1_INC10:%.*]] ], [ 0, [[FOR1_HEADER_PREHEADER:%.*]] ]
+; CHECK-NEXT:    br label [[FOR3_SPLIT1:%.*]]
+; CHECK:       for3.preheader:
+; CHECK-NEXT:    br label [[FOR3:%.*]]
+; CHECK:       for3:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR3_SPLIT:%.*]] ], [ 100, [[FOR3_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR1_HEADER_PREHEADER]]
+; CHECK:       for3.split1:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* @A, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV19]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX5]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP0]], [[K:%.*]]
+; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX5]]
+; CHECK-NEXT:    br label [[FOR1_INC10]]
+; CHECK:       for3.split:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR3]], label [[FOR_END11:%.*]]
+; CHECK:       for1.inc10:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT20]] = add nuw nsw i64 [[INDVARS_IV19]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT20]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR3_SPLIT]], label [[FOR1_HEADER]]
+; CHECK:       for.end11:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for1.header
+
+for1.header:
+  %j19 = phi i64 [ 0, %entry ], [ %j.next20, %for1.inc10 ]  ; outer IV, counts up from 0
+  br label %for3
+
+for3:
+  %j = phi i64 [ 100, %for1.header ], [ %j.next, %for3 ]  ; inner IV, counts down from 100
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* @A, i64 0, i64 %j, i64 %j19  ; &A[inner][outer]
+  %0 = load i64, i64* %arrayidx5
+  %add = add nsw i64 %0, %k
+  store i64 %add, i64* %arrayidx5  ; read-modify-write of the same element
+  %j.next = add nsw i64 %j, -1
+  %cmp2 = icmp sgt i64 %j, 0  ; tests the pre-decrement value, so the last inner iteration has %j == 0
+  br i1 %cmp2, label %for3, label %for1.inc10
+
+for1.inc10:
+  %j.next20 = add nuw nsw i64 %j19, 1
+  %exitcond = icmp eq i64 %j.next20, 100  ; tests the post-increment value: outer IV covers 0..99
+  br i1 %exitcond, label %for.end11, label %for1.header
+
+for.end11:
+  ret void
+}
+
+;; Test to make sure we can handle output dependencies.
+;;
+;;  for (int i = 1; i < 99; ++i)
+;;    for(int j = 1; j < 100; ++j) {
+;;      A[j][i] = j;
+;;      A[j][i+1] = i;
+;;    }
+;; FIXME: DA misses this case after D35430
+
+define void @interchange_10() {  ; two stores per inner iteration, to adjacent columns of the same row of @A
+entry:
+  br label %for1.header
+
+for1.header:
+  %j23 = phi i64 [ 1, %entry ], [ %j.next24, %for1.inc10 ]  ; outer IV, starts at 1
+  %j.next24 = add nuw nsw i64 %j23, 1  ; outer increment is computed in the header and reused as a column index below
+  br label %for2
+
+for2:
+  %j = phi i64 [ %j.next, %for2 ], [ 1, %for1.header ]  ; inner IV, starts at 1
+  %j.next = add nuw nsw i64 %j, 1
+  %arrayidx5 = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* @A, i64 0, i64 %j, i64 %j23  ; &A[inner][outer]
+  store i64 %j, i64* %arrayidx5  ; writes the inner IV
+  %arrayidx10 = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* @A, i64 0, i64 %j, i64 %j.next24  ; &A[inner][outer + 1]
+  store i64 %j23, i64* %arrayidx10  ; writes the outer IV
+  %exitcond = icmp eq i64 %j, 99  ; tests the pre-increment value, so the last inner iteration has %j == 99
+  br i1 %exitcond, label %for1.inc10, label %for2
+
+for1.inc10:
+  %exitcond26 = icmp eq i64 %j23, 98  ; tests the pre-increment value, so the last outer iteration has %j23 == 98
+  br i1 %exitcond26, label %for.end12, label %for1.header
+
+for.end12:
+  ret void
+
+}