[llvm] r358552 - Revert "Temporarily Revert "Add basic loop fusion pass.""

Eric Christopher via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 16 21:53:01 PDT 2019


Added: llvm/trunk/test/Transforms/PhaseOrdering/2010-03-22-empty-baseclass.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/2010-03-22-empty-baseclass.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/2010-03-22-empty-baseclass.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/2010-03-22-empty-baseclass.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,162 @@
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.1"
+
+%"struct.boost::compressed_pair<empty_t,int>" = type { %"struct.boost::details::compressed_pair_imp<empty_t,int,1>" }
+%"struct.boost::details::compressed_pair_imp<empty_t,int,1>" = type { i32 }
+%struct.empty_base_t = type <{ i8 }>
+%struct.empty_t = type <{ i8 }>
+
+@.str = private constant [25 x i8] c"x.second() was clobbered\00", align 1 ; <[25 x i8]*> [#uses=1]
+
+define i32 @main(i32 %argc, i8** %argv) ssp {
+entry:
+  %argc_addr = alloca i32, align 4                ; <i32*> [#uses=1]
+  %argv_addr = alloca i8**, align 8               ; <i8***> [#uses=1]
+  %retval = alloca i32                            ; <i32*> [#uses=2]
+  %0 = alloca i32                                 ; <i32*> [#uses=2]
+  %retval.1 = alloca i8                           ; <i8*> [#uses=2]
+  %1 = alloca %struct.empty_base_t                ; <%struct.empty_base_t*> [#uses=1]
+  %2 = alloca %struct.empty_base_t*               ; <%struct.empty_base_t**> [#uses=1]
+  %x = alloca %"struct.boost::compressed_pair<empty_t,int>" ; <%"struct.boost::compressed_pair<empty_t,int>"*> [#uses=3]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  store i32 %argc, i32* %argc_addr
+  store i8** %argv, i8*** %argv_addr
+  %3 = call i32* @_ZN5boost15compressed_pairI7empty_tiE6secondEv(%"struct.boost::compressed_pair<empty_t,int>"* %x) ssp ; <i32*> [#uses=1]
+  store i32 -3, i32* %3, align 4
+  %4 = call %struct.empty_base_t* @_ZN5boost15compressed_pairI7empty_tiE5firstEv(%"struct.boost::compressed_pair<empty_t,int>"* %x) ssp ; <%struct.empty_base_t*> [#uses=1]
+  store %struct.empty_base_t* %4, %struct.empty_base_t** %2, align 8
+  call void @_ZN7empty_tC1Ev(%struct.empty_base_t* %1) nounwind
+  %5 = call i32* @_ZN5boost15compressed_pairI7empty_tiE6secondEv(%"struct.boost::compressed_pair<empty_t,int>"* %x) ssp ; <i32*> [#uses=1]
+  %6 = load i32, i32* %5, align 4                      ; <i32> [#uses=1]
+  %7 = icmp ne i32 %6, -3                         ; <i1> [#uses=1]
+  %8 = zext i1 %7 to i8                           ; <i8> [#uses=1]
+  store i8 %8, i8* %retval.1, align 1
+  %9 = load i8, i8* %retval.1, align 1                ; <i8> [#uses=1]
+  %toBool = icmp ne i8 %9, 0                      ; <i1> [#uses=1]
+  br i1 %toBool, label %bb, label %bb1
+
+bb:                                               ; preds = %entry
+  %10 = call i32 @puts(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str, i64 0, i64 0)) ; <i32> [#uses=0]
+  call void @abort() noreturn
+  unreachable
+
+bb1:                                              ; preds = %entry
+  store i32 0, i32* %0, align 4
+  %11 = load i32, i32* %0, align 4                     ; <i32> [#uses=1]
+  store i32 %11, i32* %retval, align 4
+  br label %return
+
+; CHECK-NOT: x.second() was clobbered
+; CHECK: ret i32
+return:                                           ; preds = %bb1
+  %retval2 = load i32, i32* %retval                    ; <i32> [#uses=1]
+  ret i32 %retval2
+}
+
+define linkonce_odr void @_ZN12empty_base_tC2Ev(%struct.empty_base_t* %this) nounwind ssp align 2 {
+entry:
+  %this_addr = alloca %struct.empty_base_t*, align 8 ; <%struct.empty_base_t**> [#uses=1]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  store %struct.empty_base_t* %this, %struct.empty_base_t** %this_addr
+  br label %return
+
+return:                                           ; preds = %entry
+  ret void
+}
+
+define linkonce_odr void @_ZN7empty_tC1Ev(%struct.empty_base_t* %this) nounwind ssp align 2 {
+entry:
+  %this_addr = alloca %struct.empty_base_t*, align 8 ; <%struct.empty_base_t**> [#uses=2]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  store %struct.empty_base_t* %this, %struct.empty_base_t** %this_addr
+  %0 = load %struct.empty_base_t*, %struct.empty_base_t** %this_addr, align 8 ; <%struct.empty_base_t*> [#uses=1]
+  call void @_ZN12empty_base_tC2Ev(%struct.empty_base_t* %0) nounwind
+  br label %return
+
+return:                                           ; preds = %entry
+  ret void
+}
+
+define linkonce_odr i32* @_ZN5boost7details19compressed_pair_impI7empty_tiLi1EE6secondEv(%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"* %this) nounwind ssp align 2 {
+entry:
+  %this_addr = alloca %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"*, align 8 ; <%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"**> [#uses=2]
+  %retval = alloca i32*                           ; <i32**> [#uses=2]
+  %0 = alloca i32*                                ; <i32**> [#uses=2]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  store %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"* %this, %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"** %this_addr
+  %1 = load %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"*, %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"** %this_addr, align 8 ; <%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"*> [#uses=1]
+  %2 = getelementptr inbounds %"struct.boost::details::compressed_pair_imp<empty_t,int,1>", %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"* %1, i32 0, i32 0 ; <i32*> [#uses=1]
+  store i32* %2, i32** %0, align 8
+  %3 = load i32*, i32** %0, align 8                     ; <i32*> [#uses=1]
+  store i32* %3, i32** %retval, align 8
+  br label %return
+
+return:                                           ; preds = %entry
+  %retval1 = load i32*, i32** %retval                   ; <i32*> [#uses=1]
+  ret i32* %retval1
+}
+
+define linkonce_odr i32* @_ZN5boost15compressed_pairI7empty_tiE6secondEv(%"struct.boost::compressed_pair<empty_t,int>"* %this) ssp align 2 {
+entry:
+  %this_addr = alloca %"struct.boost::compressed_pair<empty_t,int>"*, align 8 ; <%"struct.boost::compressed_pair<empty_t,int>"**> [#uses=2]
+  %retval = alloca i32*                           ; <i32**> [#uses=2]
+  %0 = alloca i32*                                ; <i32**> [#uses=2]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  store %"struct.boost::compressed_pair<empty_t,int>"* %this, %"struct.boost::compressed_pair<empty_t,int>"** %this_addr
+  %1 = load %"struct.boost::compressed_pair<empty_t,int>"*, %"struct.boost::compressed_pair<empty_t,int>"** %this_addr, align 8 ; <%"struct.boost::compressed_pair<empty_t,int>"*> [#uses=1]
+  %2 = getelementptr inbounds %"struct.boost::compressed_pair<empty_t,int>", %"struct.boost::compressed_pair<empty_t,int>"* %1, i32 0, i32 0 ; <%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"*> [#uses=1]
+  %3 = call i32* @_ZN5boost7details19compressed_pair_impI7empty_tiLi1EE6secondEv(%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"* %2) nounwind ; <i32*> [#uses=1]
+  store i32* %3, i32** %0, align 8
+  %4 = load i32*, i32** %0, align 8                     ; <i32*> [#uses=1]
+  store i32* %4, i32** %retval, align 8
+  br label %return
+
+return:                                           ; preds = %entry
+  %retval1 = load i32*, i32** %retval                   ; <i32*> [#uses=1]
+  ret i32* %retval1
+}
+
+define linkonce_odr %struct.empty_base_t* @_ZN5boost7details19compressed_pair_impI7empty_tiLi1EE5firstEv(%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"* %this) nounwind ssp align 2 {
+entry:
+  %this_addr = alloca %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"*, align 8 ; <%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"**> [#uses=2]
+  %retval = alloca %struct.empty_base_t*          ; <%struct.empty_base_t**> [#uses=2]
+  %0 = alloca %struct.empty_base_t*               ; <%struct.empty_base_t**> [#uses=2]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  store %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"* %this, %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"** %this_addr
+  %1 = load %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"*, %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"** %this_addr, align 8 ; <%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"*> [#uses=1]
+  %2 = bitcast %"struct.boost::details::compressed_pair_imp<empty_t,int,1>"* %1 to %struct.empty_base_t* ; <%struct.empty_base_t*> [#uses=1]
+  store %struct.empty_base_t* %2, %struct.empty_base_t** %0, align 8
+  %3 = load %struct.empty_base_t*, %struct.empty_base_t** %0, align 8    ; <%struct.empty_base_t*> [#uses=1]
+  store %struct.empty_base_t* %3, %struct.empty_base_t** %retval, align 8
+  br label %return
+
+return:                                           ; preds = %entry
+  %retval1 = load %struct.empty_base_t*, %struct.empty_base_t** %retval  ; <%struct.empty_base_t*> [#uses=1]
+  ret %struct.empty_base_t* %retval1
+}
+
+define linkonce_odr %struct.empty_base_t* @_ZN5boost15compressed_pairI7empty_tiE5firstEv(%"struct.boost::compressed_pair<empty_t,int>"* %this) ssp align 2 {
+entry:
+  %this_addr = alloca %"struct.boost::compressed_pair<empty_t,int>"*, align 8 ; <%"struct.boost::compressed_pair<empty_t,int>"**> [#uses=2]
+  %retval = alloca %struct.empty_base_t*          ; <%struct.empty_base_t**> [#uses=2]
+  %0 = alloca %struct.empty_base_t*               ; <%struct.empty_base_t**> [#uses=2]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  store %"struct.boost::compressed_pair<empty_t,int>"* %this, %"struct.boost::compressed_pair<empty_t,int>"** %this_addr
+  %1 = load %"struct.boost::compressed_pair<empty_t,int>"*, %"struct.boost::compressed_pair<empty_t,int>"** %this_addr, align 8 ; <%"struct.boost::compressed_pair<empty_t,int>"*> [#uses=1]
+  %2 = getelementptr inbounds %"struct.boost::compressed_pair<empty_t,int>", %"struct.boost::compressed_pair<empty_t,int>"* %1, i32 0, i32 0 ; <%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"*> [#uses=1]
+  %3 = call %struct.empty_base_t* @_ZN5boost7details19compressed_pair_impI7empty_tiLi1EE5firstEv(%"struct.boost::details::compressed_pair_imp<empty_t,int,1>"* %2) nounwind ; <%struct.empty_base_t*> [#uses=1]
+  store %struct.empty_base_t* %3, %struct.empty_base_t** %0, align 8
+  %4 = load %struct.empty_base_t*, %struct.empty_base_t** %0, align 8    ; <%struct.empty_base_t*> [#uses=1]
+  store %struct.empty_base_t* %4, %struct.empty_base_t** %retval, align 8
+  br label %return
+
+return:                                           ; preds = %entry
+  %retval1 = load %struct.empty_base_t*, %struct.empty_base_t** %retval  ; <%struct.empty_base_t*> [#uses=1]
+  ret %struct.empty_base_t* %retval1
+}
+
+declare i32 @puts(i8*)
+
+declare void @abort() noreturn

Added: llvm/trunk/test/Transforms/PhaseOrdering/PR6627.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/PR6627.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/PR6627.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/PR6627.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,93 @@
+; RUN: opt -O3 -S < %s | FileCheck %s
+; XFAIL: *
+
+declare i32 @doo(...)
+
+; PR6627 - This whole nasty sequence should be flattened down to a single
+; 32-bit comparison.
+define void @test2(i8* %arrayidx) nounwind ssp {
+entry:
+  %xx = bitcast i8* %arrayidx to i32*
+  %x1 = load i32, i32* %xx, align 4
+  %tmp = trunc i32 %x1 to i8
+  %conv = zext i8 %tmp to i32
+  %cmp = icmp eq i32 %conv, 127
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %arrayidx4 = getelementptr inbounds i8, i8* %arrayidx, i64 1
+  %tmp5 = load i8, i8* %arrayidx4, align 1
+  %conv6 = zext i8 %tmp5 to i32
+  %cmp7 = icmp eq i32 %conv6, 69
+  br i1 %cmp7, label %land.lhs.true9, label %if.end
+
+land.lhs.true9:                                   ; preds = %land.lhs.true
+  %arrayidx12 = getelementptr inbounds i8, i8* %arrayidx, i64 2
+  %tmp13 = load i8, i8* %arrayidx12, align 1
+  %conv14 = zext i8 %tmp13 to i32
+  %cmp15 = icmp eq i32 %conv14, 76
+  br i1 %cmp15, label %land.lhs.true17, label %if.end
+
+land.lhs.true17:                                  ; preds = %land.lhs.true9
+  %arrayidx20 = getelementptr inbounds i8, i8* %arrayidx, i64 3
+  %tmp21 = load i8, i8* %arrayidx20, align 1
+  %conv22 = zext i8 %tmp21 to i32
+  %cmp23 = icmp eq i32 %conv22, 70
+  br i1 %cmp23, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true17
+  %call25 = call i32 (...) @doo()
+  br label %if.end
+
+if.end:
+  ret void
+
+; CHECK-LABEL: @test2(
+; CHECK: %x1 = load i32, i32* %xx, align 4
+; CHECK-NEXT: icmp eq i32 %x1, 1179403647
+; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end 
+}
+
+; PR6627 - This should all be flattened down to one compare.  This is the same
+; as test2, except that the initial load is done as an i8 instead of i32, thus
+; requiring widening.
+define void @test2a(i8* %arrayidx) nounwind ssp {
+entry:
+  %x1 = load i8, i8* %arrayidx, align 4
+  %conv = zext i8 %x1 to i32
+  %cmp = icmp eq i32 %conv, 127
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %arrayidx4 = getelementptr inbounds i8, i8* %arrayidx, i64 1
+  %tmp5 = load i8, i8* %arrayidx4, align 1
+  %conv6 = zext i8 %tmp5 to i32
+  %cmp7 = icmp eq i32 %conv6, 69
+  br i1 %cmp7, label %land.lhs.true9, label %if.end
+
+land.lhs.true9:                                   ; preds = %land.lhs.true
+  %arrayidx12 = getelementptr inbounds i8, i8* %arrayidx, i64 2
+  %tmp13 = load i8, i8* %arrayidx12, align 1
+  %conv14 = zext i8 %tmp13 to i32
+  %cmp15 = icmp eq i32 %conv14, 76
+  br i1 %cmp15, label %land.lhs.true17, label %if.end
+
+land.lhs.true17:                                  ; preds = %land.lhs.true9
+  %arrayidx20 = getelementptr inbounds i8, i8* %arrayidx, i64 3
+  %tmp21 = load i8, i8* %arrayidx20, align 1
+  %conv22 = zext i8 %tmp21 to i32
+  %cmp23 = icmp eq i32 %conv22, 70
+  br i1 %cmp23, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true17
+  %call25 = call i32 (...) @doo()
+  br label %if.end
+
+if.end:
+  ret void
+
+; CHECK-LABEL: @test2a(
+; CHECK: %x1 = load i32, i32* {{.*}}, align 4
+; CHECK-NEXT: icmp eq i32 %x1, 1179403647
+; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end 
+}

Added: llvm/trunk/test/Transforms/PhaseOrdering/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/basic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/basic.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/basic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,51 @@
+; RUN: opt -O3 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+
+
+; PR2338
+define void @test1() nounwind ssp {
+  %retval = alloca i32, align 4
+  %i = alloca i8*, align 8
+  %call = call i8* @malloc(i64 1)
+  store i8* %call, i8** %i, align 8
+  %tmp = load i8*, i8** %i, align 8
+  store i8 1, i8* %tmp
+  %tmp1 = load i8*, i8** %i, align 8
+  call void @free(i8* %tmp1)
+  ret void
+
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: ret void
+}
+
+; This function exposes a phase ordering problem when InstCombine is
+; turning %add into a bitmask, making it difficult to spot a 0 return value.
+;
+; It is also important that %add is expressed as a multiple of %div so scalar
+; evolution can recognize it.
+define i32 @test2(i32 %a, i32* %p) nounwind uwtable ssp {
+entry:
+  %div = udiv i32 %a, 4
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 0
+  store i32 %div, i32* %arrayidx, align 4
+  %add = add i32 %div, %div
+  %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1
+  store i32 %add, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %p, i64 1
+  %0 = load i32, i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %p, i64 0
+  %1 = load i32, i32* %arrayidx3, align 4
+  %mul = mul i32 2, %1
+  %sub = sub i32 %0, %mul
+  ret i32 %sub
+
+; CHECK-LABEL: @test2(
+; CHECK: %div = lshr i32 %a, 2
+; CHECK: %add = shl nuw nsw i32 %div, 1
+; CHECK: ret i32 0
+}

Added: llvm/trunk/test/Transforms/PhaseOrdering/bitfield-bittests.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/bitfield-bittests.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/bitfield-bittests.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/bitfield-bittests.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3 -S < %s                    | FileCheck %s
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s
+
+; These are tests that check for set/clear bits in a bitfield based on PR37098:
+; https://bugs.llvm.org/show_bug.cgi?id=37098
+;
+; The initial IR from clang has been transformed by SROA, but no other passes
+; have run yet. In all cases, we should reduce these to a mask and compare
+; instead of shift/cast/logic ops.
+;
+; Currently, this happens mostly through a combination of instcombine and
+; aggressive-instcombine. If pass ordering changes, we may have to adjust
+; the pattern matching in 1 or both of those passes.
+
+; Legal i32 is required to allow casting transforms that eliminate the zexts.
+target datalayout = "n32"
+
+define i32 @allclear(i32 %a) {
+; CHECK-LABEL: @allclear(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 15
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %a.sroa.0.0.trunc = trunc i32 %a to i8
+  %a.sroa.5.0.shift = lshr i32 %a, 8
+  %bf.clear = and i8 %a.sroa.0.0.trunc, 1
+  %bf.cast = zext i8 %bf.clear to i32
+  %bf.lshr = lshr i8 %a.sroa.0.0.trunc, 1
+  %bf.clear2 = and i8 %bf.lshr, 1
+  %bf.cast3 = zext i8 %bf.clear2 to i32
+  %or = or i32 %bf.cast, %bf.cast3
+  %bf.lshr5 = lshr i8 %a.sroa.0.0.trunc, 2
+  %bf.clear6 = and i8 %bf.lshr5, 1
+  %bf.cast7 = zext i8 %bf.clear6 to i32
+  %or8 = or i32 %or, %bf.cast7
+  %bf.lshr10 = lshr i8 %a.sroa.0.0.trunc, 3
+  %bf.clear11 = and i8 %bf.lshr10, 1
+  %bf.cast12 = zext i8 %bf.clear11 to i32
+  %or13 = or i32 %or8, %bf.cast12
+  %cmp = icmp eq i32 %or13, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @anyset(i32 %a) {
+; CHECK-LABEL: @anyset(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 15
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %a.sroa.0.0.trunc = trunc i32 %a to i8
+  %a.sroa.5.0.shift = lshr i32 %a, 8
+  %bf.clear = and i8 %a.sroa.0.0.trunc, 1
+  %bf.cast = zext i8 %bf.clear to i32
+  %bf.lshr = lshr i8 %a.sroa.0.0.trunc, 1
+  %bf.clear2 = and i8 %bf.lshr, 1
+  %bf.cast3 = zext i8 %bf.clear2 to i32
+  %or = or i32 %bf.cast, %bf.cast3
+  %bf.lshr5 = lshr i8 %a.sroa.0.0.trunc, 2
+  %bf.clear6 = and i8 %bf.lshr5, 1
+  %bf.cast7 = zext i8 %bf.clear6 to i32
+  %or8 = or i32 %or, %bf.cast7
+  %bf.lshr10 = lshr i8 %a.sroa.0.0.trunc, 3
+  %bf.clear11 = and i8 %bf.lshr10, 1
+  %bf.cast12 = zext i8 %bf.clear11 to i32
+  %or13 = or i32 %or8, %bf.cast12
+  %cmp = icmp ne i32 %or13, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @allset(i32 %a) {
+; CHECK-LABEL: @allset(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 15
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 15
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %a.sroa.0.0.trunc = trunc i32 %a to i8
+  %a.sroa.5.0.shift = lshr i32 %a, 8
+  %bf.clear = and i8 %a.sroa.0.0.trunc, 1
+  %bf.cast = zext i8 %bf.clear to i32
+  %bf.lshr = lshr i8 %a.sroa.0.0.trunc, 1
+  %bf.clear2 = and i8 %bf.lshr, 1
+  %bf.cast3 = zext i8 %bf.clear2 to i32
+  %and = and i32 %bf.cast, %bf.cast3
+  %bf.lshr5 = lshr i8 %a.sroa.0.0.trunc, 2
+  %bf.clear6 = and i8 %bf.lshr5, 1
+  %bf.cast7 = zext i8 %bf.clear6 to i32
+  %and8 = and i32 %and, %bf.cast7
+  %bf.lshr10 = lshr i8 %a.sroa.0.0.trunc, 3
+  %bf.clear11 = and i8 %bf.lshr10, 1
+  %bf.cast12 = zext i8 %bf.clear11 to i32
+  %and13 = and i32 %and8, %bf.cast12
+  %cmp = icmp ne i32 %and13, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @anyclear(i32 %a) {
+; CHECK-LABEL: @anyclear(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[A:%.*]], 15
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 15
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %a.sroa.0.0.trunc = trunc i32 %a to i8
+  %a.sroa.5.0.shift = lshr i32 %a, 8
+  %bf.clear = and i8 %a.sroa.0.0.trunc, 1
+  %bf.cast = zext i8 %bf.clear to i32
+  %bf.lshr = lshr i8 %a.sroa.0.0.trunc, 1
+  %bf.clear2 = and i8 %bf.lshr, 1
+  %bf.cast3 = zext i8 %bf.clear2 to i32
+  %and = and i32 %bf.cast, %bf.cast3
+  %bf.lshr5 = lshr i8 %a.sroa.0.0.trunc, 2
+  %bf.clear6 = and i8 %bf.lshr5, 1
+  %bf.cast7 = zext i8 %bf.clear6 to i32
+  %and8 = and i32 %and, %bf.cast7
+  %bf.lshr10 = lshr i8 %a.sroa.0.0.trunc, 3
+  %bf.clear11 = and i8 %bf.lshr10, 1
+  %bf.cast12 = zext i8 %bf.clear11 to i32
+  %and13 = and i32 %and8, %bf.cast12
+  %cmp = icmp eq i32 %and13, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+

Added: llvm/trunk/test/Transforms/PhaseOrdering/gdce.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/gdce.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/gdce.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/gdce.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,106 @@
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+; Run global DCE to eliminate unused ctor and dtor.
+; rdar://9142819
+
+; CHECK: main
+; CHECK-NOT: _ZN4BaseC1Ev
+; CHECK-NOT: _ZN4BaseD1Ev
+; CHECK-NOT: _ZN4BaseD2Ev
+; CHECK-NOT: _ZN4BaseC2Ev
+; CHECK-NOT: _ZN4BaseD0Ev
+
+%class.Base = type { i32 (...)** }
+
+@_ZTV4Base = linkonce_odr unnamed_addr constant [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI4Base to i8*), i8* bitcast (void (%class.Base*)* @_ZN4BaseD1Ev to i8*), i8* bitcast (void (%class.Base*)* @_ZN4BaseD0Ev to i8*)]
+@_ZTVN10__cxxabiv117__class_type_infoE = external global i8*
+@_ZTS4Base = linkonce_odr constant [6 x i8] c"4Base\00"
+@_ZTI4Base = linkonce_odr unnamed_addr constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @_ZTS4Base, i32 0, i32 0) }
+
+define i32 @main() uwtable ssp {
+entry:
+  %retval = alloca i32, align 4
+  %b = alloca %class.Base, align 8
+  %cleanup.dest.slot = alloca i32
+  store i32 0, i32* %retval
+  call void @_ZN4BaseC1Ev(%class.Base* %b)
+  store i32 0, i32* %retval
+  store i32 1, i32* %cleanup.dest.slot
+  call void @_ZN4BaseD1Ev(%class.Base* %b)
+  %0 = load i32, i32* %retval
+  ret i32 %0
+}
+
+define linkonce_odr void @_ZN4BaseC1Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base*, %class.Base** %this.addr
+  call void @_ZN4BaseC2Ev(%class.Base* %this1)
+  ret void
+}
+
+define linkonce_odr void @_ZN4BaseD1Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base*, %class.Base** %this.addr
+  call void @_ZN4BaseD2Ev(%class.Base* %this1)
+  ret void
+}
+
+define linkonce_odr void @_ZN4BaseD2Ev(%class.Base* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base*, %class.Base** %this.addr
+  ret void
+}
+
+define linkonce_odr void @_ZN4BaseC2Ev(%class.Base* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base*, %class.Base** %this.addr
+  %0 = bitcast %class.Base* %this1 to i8***
+  store i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV4Base, i64 0, i64 2), i8*** %0
+  ret void
+}
+
+define linkonce_odr void @_ZN4BaseD0Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %this.addr = alloca %class.Base*, align 8
+  %exn.slot = alloca i8*
+  %ehselector.slot = alloca i32
+  store %class.Base* %this, %class.Base** %this.addr, align 8
+  %this1 = load %class.Base*, %class.Base** %this.addr
+  invoke void @_ZN4BaseD1Ev(%class.Base* %this1)
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %0 = bitcast %class.Base* %this1 to i8*
+  call void @_ZdlPv(i8* %0) nounwind
+  ret void
+
+lpad:                                             ; preds = %entry
+  %1 = landingpad { i8*, i32 }
+          cleanup
+  %2 = extractvalue { i8*, i32 } %1, 0
+  store i8* %2, i8** %exn.slot
+  %3 = extractvalue { i8*, i32 } %1, 1
+  store i32 %3, i32* %ehselector.slot
+  %4 = bitcast %class.Base* %this1 to i8*
+  call void @_ZdlPv(i8* %4) nounwind
+  br label %eh.resume
+
+eh.resume:                                        ; preds = %lpad
+  %exn = load i8*, i8** %exn.slot
+  %sel = load i32, i32* %ehselector.slot
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0
+  %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1
+  resume { i8*, i32 } %lpad.val2
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZdlPv(i8*) nounwind

Added: llvm/trunk/test/Transforms/PhaseOrdering/globalaa-retained.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/globalaa-retained.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/globalaa-retained.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/globalaa-retained.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,66 @@
+; RUN: opt -O3 -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+@v = internal unnamed_addr global i32 0, align 4
+@p = common global i32* null, align 8
+
+
+; This test checks that a number of loads and stores are eliminated,
+; that can only be eliminated based on GlobalsAA information. As such,
+; it tests that GlobalsAA information is retained until the passes
+; that perform this optimization, and it protects against accidentally
+; dropping the GlobalsAA information earlier in the pipeline, which
+; has happened a few times.
+
+; GlobalsAA invalidation might happen later in the FunctionPassManager
+; pipeline than the optimization eliminating unnecessary loads/stores.
+; Since GlobalsAA is a module-level analysis, any FunctionPass
+; invalidating the GlobalsAA information will affect FunctionPass
+; pipelines that execute later. For example, assume a FunctionPass1 |
+; FunctionPass2 pipeline and 2 functions to be processed: f1 and f2.
+; Assume furthermore that FunctionPass1 uses GlobalsAA info to do an
+; optimization, and FunctionPass2 invalidates GlobalsAA. Assume the
+; function passes run in the following order: FunctionPass1(f1),
+; FunctionPass2(f1), FunctionPass1(f2), FunctionPass2(f2). Then
+; FunctionPass1 will not be able to optimize f2, since GlobalsAA will
+; have been invalidated in FunctionPass2(f1).
+
+; To try and also test this scenario, there is an empty function
+; before and after the function we're checking so that one of them
+; will be processed by the whole set of FunctionPasses before @f. That
+; will ensure that if the invalidation happens, it happens before the
+; actual optimizations on @f start.
+define void @bar() {
+entry:
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @f(i32 %n) {
+entry:
+  %0 = load i32, i32* @v, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @v, align 4
+  %1 = load i32*, i32** @p, align 8
+  store i32 %n, i32* %1, align 4
+  %2 = load i32, i32* @v, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* @v, align 4
+  ret void
+}
+
+; check variable v is loaded/stored only once after optimization,
+; which should prove that globalsAA survives until the optimization
+; that can use it to optimize away the duplicate load/stores on
+; variable v.
+; CHECK:     load i32, i32* @v, align 4
+; CHECK:     store i32 {{.*}}, i32* @v, align 4
+; CHECK-NOT: load i32, i32* @v, align 4
+; CHECK-NOT:     store i32 {{.*}}, i32* @v, align 4
+
+; Same as @bar above, in case the functions are processed in reverse order.
+define void @bar2() {
+entry:
+  ret void
+}

Added: llvm/trunk/test/Transforms/PhaseOrdering/rotate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/rotate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/rotate.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/rotate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O3 -S < %s                    | FileCheck %s --check-prefixes=ANY,OLDPM
+; RUN: opt -passes='default<O3>' -S < %s  | FileCheck %s --check-prefixes=ANY,NEWPM
+
+; This should become a single funnel shift through a combination
+; of aggressive-instcombine, simplifycfg, and instcombine.
+; https://bugs.llvm.org/show_bug.cgi?id=34924
+; These are equivalent, but the value name with the new-pm shows a bug -
+; this code should not have been converted to a speculative select with
+; an intermediate transform.
+
+define i32 @rotl(i32 %a, i32 %b) {
+; OLDPM-LABEL: @rotl(
+; OLDPM-NEXT:  entry:
+; OLDPM-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B:%.*]])
+; OLDPM-NEXT:    ret i32 [[TMP0]]
+;
+; NEWPM-LABEL: @rotl(
+; NEWPM-NEXT:  entry:
+; NEWPM-NEXT:    [[SPEC_SELECT:%.*]] = tail call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[A]], i32 [[B:%.*]])
+; NEWPM-NEXT:    ret i32 [[SPEC_SELECT]]
+;
+entry:
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %end, label %rotbb
+
+rotbb:
+  %sub = sub i32 32, %b
+  %shr = lshr i32 %a, %sub
+  %shl = shl i32 %a, %b
+  %or = or i32 %shr, %shl
+  br label %end
+
+end:
+  %cond = phi i32 [ %or, %rotbb ], [ %a, %entry ]
+  ret i32 %cond
+}
+

Added: llvm/trunk/test/Transforms/PhaseOrdering/scev-custom-dl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/scev-custom-dl.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/scev-custom-dl.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/scev-custom-dl.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,67 @@
+; RUN: opt -O3 -S -analyze -scalar-evolution < %s | FileCheck %s
+
+target datalayout = "e-m:m-p:40:64:64:32-i32:32-i16:16-i8:8-n32"
+
+;
+; This file contains phase ordering tests for scalar evolution.
+; Test that the standard passes don't obfuscate the IR so scalar evolution can't
+; recognize expressions.
+
+; CHECK: test1
+; The loop body contains two increments by %div.
+; Make sure that 2*%div is recognizable, and not expressed as a bit mask of %d.
+; CHECK: -->  {%p,+,(8 * (%d /u 4))}
+define void @test1(i32 %d, i32* %p) nounwind uwtable ssp {
+entry:
+  %div = udiv i32 %d, 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %p.addr.0 = phi i32* [ %p, %entry ], [ %add.ptr1, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp ne i32 %i.0, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %p.addr.0, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p.addr.0, i32 %div
+  store i32 1, i32* %add.ptr, align 4
+  %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %div
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; CHECK: test1a
+; Same thing as test1, but it is even more tempting to fold 2 * (%d /u 2)
+; CHECK: -->  {%p,+,(8 * (%d /u 2))}
+define void @test1a(i32 %d, i32* %p) nounwind uwtable ssp {
+entry:
+  %div = udiv i32 %d, 2
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %p.addr.0 = phi i32* [ %p, %entry ], [ %add.ptr1, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp ne i32 %i.0, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %p.addr.0, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p.addr.0, i32 %div
+  store i32 1, i32* %add.ptr, align 4
+  %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %div
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

Added: llvm/trunk/test/Transforms/PhaseOrdering/scev.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/scev.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/scev.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/scev.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,64 @@
+; RUN: opt -O3 -S -analyze -scalar-evolution < %s | FileCheck %s
+;
+; This file contains phase ordering tests for scalar evolution.
+; Test that the standard passes don't obfuscate the IR so scalar evolution can't
+; recognize expressions.
+
+; CHECK: test1
+; The loop body contains two increments by %div.
+; Make sure that 2*%div is recognizable, and not expressed as a bit mask of %d.
+; CHECK: -->  {%p,+,(8 * (%d /u 4))}
+define void @test1(i64 %d, i32* %p) nounwind uwtable ssp {
+entry:
+  %div = udiv i64 %d, 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %p.addr.0 = phi i32* [ %p, %entry ], [ %add.ptr1, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp ne i32 %i.0, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %p.addr.0, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p.addr.0, i64 %div
+  store i32 1, i32* %add.ptr, align 4
+  %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i64 %div
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; CHECK: test1a
+; Same thing as test1, but it is even more tempting to fold 2 * (%d /u 2)
+; CHECK: -->  {%p,+,(8 * (%d /u 2))}
+define void @test1a(i64 %d, i32* %p) nounwind uwtable ssp {
+entry:
+  %div = udiv i64 %d, 2
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %p.addr.0 = phi i32* [ %p, %entry ], [ %add.ptr1, %for.inc ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp ne i32 %i.0, 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  store i32 0, i32* %p.addr.0, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p.addr.0, i64 %div
+  store i32 1, i32* %add.ptr, align 4
+  %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i64 %div
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}

Added: llvm/trunk/test/Transforms/PhaseOrdering/simplifycfg-options.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PhaseOrdering/simplifycfg-options.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PhaseOrdering/simplifycfg-options.ll (added)
+++ llvm/trunk/test/Transforms/PhaseOrdering/simplifycfg-options.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -O1 -S < %s                    | FileCheck %s --check-prefix=ALL --check-prefix=OLDPM
+; RUN: opt -passes='default<O1>' -S < %s  | FileCheck %s --check-prefix=ALL --check-prefix=NEWPM
+
+; Don't simplify unconditional branches from empty blocks in simplifyCFG
+; until late in the pipeline because it can destroy canonical loop structure.
+
+define i1 @PR33605(i32 %a, i32 %b, i32* %c) {
+; ALL-LABEL: @PR33605(
+; ALL-NEXT:  for.body:
+; ALL-NEXT:    [[OR:%.*]] = or i32 [[B:%.*]], [[A:%.*]]
+; ALL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; ALL-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[OR]], [[TMP0]]
+; ALL-NEXT:    br i1 [[CMP]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; ALL:       if.then:
+; ALL-NEXT:    store i32 [[OR]], i32* [[ARRAYIDX]], align 4
+; ALL-NEXT:    tail call void @foo()
+; ALL-NEXT:    br label [[IF_END]]
+; ALL:       if.end:
+; ALL-NEXT:    [[CHANGED_1_OFF0:%.*]] = phi i1 [ true, [[IF_THEN]] ], [ false, [[FOR_BODY:%.*]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C]], align 4
+; ALL-NEXT:    [[CMP_1:%.*]] = icmp eq i32 [[OR]], [[TMP1]]
+; ALL-NEXT:    br i1 [[CMP_1]], label [[IF_END_1:%.*]], label [[IF_THEN_1:%.*]]
+; ALL:       if.then.1:
+; ALL-NEXT:    store i32 [[OR]], i32* [[C]], align 4
+; ALL-NEXT:    tail call void @foo()
+; ALL-NEXT:    br label [[IF_END_1]]
+; ALL:       if.end.1:
+; ALL-NEXT:    [[CHANGED_1_OFF0_1:%.*]] = phi i1 [ true, [[IF_THEN_1]] ], [ [[CHANGED_1_OFF0]], [[IF_END]] ]
+; ALL-NEXT:    ret i1 [[CHANGED_1_OFF0_1]]
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 2, %entry ], [ %dec, %if.end ]
+  %changed.0.off0 = phi i1 [ false, %entry ], [ %changed.1.off0, %if.end ]
+  %dec = add nsw i32 %i.0, -1
+  %tobool = icmp eq i32 %i.0, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  %changed.0.off0.lcssa = phi i1 [ %changed.0.off0, %for.cond ]
+  ret i1 %changed.0.off0.lcssa
+
+for.body:
+  %or = or i32 %a, %b
+  %idxprom = sext i32 %dec to i64
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp = icmp eq i32 %or, %0
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:
+  store i32 %or, i32* %arrayidx, align 4
+  call void @foo()
+  br label %if.end
+
+if.end:
+  %changed.1.off0 = phi i1 [ true, %if.then ], [ %changed.0.off0, %for.body ]
+  br label %for.cond
+}
+
+declare void @foo()
+
+; PR34603 - https://bugs.llvm.org/show_bug.cgi?id=34603
+; We should have a select of doubles, not a select of double pointers.
+; SimplifyCFG should not flatten this before early-cse has a chance to eliminate redundant ops.
+
+define double @max_of_loads(double* %x, double* %y, i64 %i) {
+; ALL-LABEL: @max_of_loads(
+; ALL-NEXT:  entry:
+; ALL-NEXT:    [[XI_PTR:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[I:%.*]]
+; ALL-NEXT:    [[YI_PTR:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[I]]
+; ALL-NEXT:    [[XI:%.*]] = load double, double* [[XI_PTR]], align 8
+; ALL-NEXT:    [[YI:%.*]] = load double, double* [[YI_PTR]], align 8
+; ALL-NEXT:    [[CMP:%.*]] = fcmp ogt double [[XI]], [[YI]]
+; ALL-NEXT:    [[XI_YI:%.*]] = select i1 [[CMP]], double [[XI]], double [[YI]]
+; ALL-NEXT:    ret double [[XI_YI]]
+;
+entry:
+  %xi_ptr = getelementptr double, double* %x, i64 %i
+  %yi_ptr = getelementptr double, double* %y, i64 %i
+  %xi = load double, double* %xi_ptr
+  %yi = load double, double* %yi_ptr
+  %cmp = fcmp ogt double %xi, %yi
+  br i1 %cmp, label %if, label %else
+
+if:
+  %xi_ptr_again = getelementptr double, double* %x, i64 %i
+  %xi_again = load double, double* %xi_ptr_again
+  br label %end
+
+else:
+  %yi_ptr_again = getelementptr double, double* %y, i64 %i
+  %yi_again = load double, double* %yi_ptr_again
+  br label %end
+
+end:
+  %max = phi double [ %xi_again,  %if ], [ %yi_again, %else ]
+  ret double %max
+}
+

Added: llvm/trunk/test/Transforms/PlaceSafepoints/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/basic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/basic.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/basic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,77 @@
+; RUN: opt < %s -S -place-safepoints | FileCheck %s
+
+
+; Do we insert a simple entry safepoint?
+define void @test_entry() gc "statepoint-example" {
+; CHECK-LABEL: @test_entry
+entry:
+; CHECK-LABEL: entry
+; CHECK: call void @do_safepoint
+  ret void
+}
+
+; On a non-gc function, we should NOT get an entry safepoint
+define void @test_negative() {
+; CHECK-LABEL: @test_negative
+entry:
+; CHECK-NOT: do_safepoint
+  ret void
+}
+
+; Do we insert a backedge safepoint in a statically
+; infinite loop?
+define void @test_backedge() gc "statepoint-example" {
+; CHECK-LABEL: test_backedge
+entry:
+; CHECK-LABEL: entry
+; This statepoint is technically not required, but we don't exploit that yet.
+; CHECK: call void @do_safepoint
+  br label %other
+
+; CHECK-LABEL: other
+; CHECK: call void @do_safepoint
+other:
+  br label %other
+}
+
+; Check that we remove an unreachable block rather than trying
+; to insert a backedge safepoint
+define void @test_unreachable() gc "statepoint-example" {
+; CHECK-LABEL: test_unreachable
+entry:
+; CHECK-LABEL: entry
+; CHECK: call void @do_safepoint
+  ret void
+
+; CHECK-NOT: other
+; CHECK-NOT: do_safepoint
+other:
+  br label %other
+}
+
+declare void @foo()
+
+declare zeroext i1 @i1_return_i1(i1)
+
+define i1 @test_call_with_result() gc "statepoint-example" {
+; CHECK-LABEL: test_call_with_result
+; This is checking that a statepoint_poll is inserted for a function
+; that takes 1 argument.
+; CHECK: call void @do_safepoint
+entry:
+  %call1 = tail call i1 (i1) @i1_return_i1(i1 false)
+  ret i1 %call1
+}
+
+; This function is inlined when inserting a poll.  To avoid recursive 
+; issues, make sure we don't place safepoints in it.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+; CHECK-LABEL: entry
+; CHECK-NEXT: do_safepoint
+; CHECK-NEXT: ret void 
+entry:
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/PlaceSafepoints/call-in-loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/call-in-loop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/call-in-loop.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/call-in-loop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; If there's a call in the loop which dominates the backedge, we 
+; don't need a safepoint poll (since the callee must contain a 
+; poll test).
+;; RUN: opt < %s -place-safepoints -S | FileCheck %s
+
+declare void @foo()
+
+define void @test1() gc "statepoint-example" {
+; CHECK-LABEL: test1
+
+entry:
+; CHECK-LABEL: entry
+; CHECK: call void @do_safepoint
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop
+; CHECK-NOT: call void @do_safepoint
+  call void @foo()
+  br label %loop
+}
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/PlaceSafepoints/finite-loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/finite-loops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/finite-loops.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/finite-loops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,143 @@
+; Tests to ensure that we are not placing backedge safepoints in
+; loops which are clearly finite.
+;; RUN: opt < %s -place-safepoints -spp-counted-loop-trip-width=32 -S | FileCheck %s
+;; RUN: opt < %s -place-safepoints -spp-counted-loop-trip-width=64 -S | FileCheck %s -check-prefix=COUNTED-64
+
+
+; A simple counted loop with trivially known range
+define void @test1(i32) gc "statepoint-example" {
+; CHECK-LABEL: test1
+; CHECK-LABEL: entry
+; CHECK: call void @do_safepoint
+; CHECK-LABEL: loop
+; CHECK-NOT: call void @do_safepoint
+; CHECK-LABEL: exit
+
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0 , %entry ], [ %counter.inc , %loop ]
+  %counter.inc = add i32 %counter, 1
+  %counter.cmp = icmp slt i32 %counter.inc, 16
+  br i1 %counter.cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The same counted loop, but with an unknown early exit
+define void @test2(i32) gc "statepoint-example" {
+; CHECK-LABEL: test2
+; CHECK-LABEL: entry
+; CHECK: call void @do_safepoint
+; CHECK-LABEL: loop
+; CHECK-NOT: call void @do_safepoint
+; CHECK-LABEL: exit
+
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0 , %entry ], [ %counter.inc , %continue ]
+  %counter.inc = add i32 %counter, 1
+  %counter.cmp = icmp slt i32 %counter.inc, 16
+  br i1 undef, label %continue, label %exit
+
+continue:
+  br i1 %counter.cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The range is a 8 bit value and we can't overflow
+define void @test3(i8 %upper) gc "statepoint-example" {
+; CHECK-LABEL: test3
+; CHECK-LABEL: entry
+; CHECK: call void @do_safepoint
+; CHECK-LABEL: loop
+; CHECK-NOT: call void @do_safepoint
+; CHECK-LABEL: exit
+
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i8 [ 0 , %entry ], [ %counter.inc , %loop ]
+  %counter.inc = add nsw i8 %counter, 1
+  %counter.cmp = icmp slt i8 %counter.inc, %upper
+  br i1 %counter.cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; The range is a 64 bit value
+define void @test4(i64 %upper) gc "statepoint-example" {
+; CHECK-LABEL: test4
+; CHECK-LABEL: entry
+; CHECK: call void @do_safepoint
+; CHECK-LABEL: loop
+; CHECK: call void @do_safepoint
+; CHECK-LABEL: exit
+
+; COUNTED-64-LABEL: test4
+; COUNTED-64-LABEL: entry
+; COUNTED-64: call void @do_safepoint
+; COUNTED-64-LABEL: loop
+; COUNTED-64-NOT: call void @do_safepoint
+; COUNTED-64-LABEL: exit
+
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i64 [ 0 , %entry ], [ %counter.inc , %loop ]
+  %counter.inc = add i64 %counter, 1
+  %counter.cmp = icmp slt i64 %counter.inc, %upper
+  br i1 %counter.cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; This loop can run infinitely (for %upper == INT64_MAX) so it needs a
+; safepoint.
+define void @test5(i64 %upper) gc "statepoint-example" {
+; CHECK-LABEL: test5
+; CHECK-LABEL: entry
+; CHECK: call void @do_safepoint
+; CHECK-LABEL: loop
+; CHECK: call void @do_safepoint
+; CHECK-LABEL: exit
+
+; COUNTED-64-LABEL: test5
+; COUNTED-64-LABEL: entry
+; COUNTED-64: call void @do_safepoint
+; COUNTED-64-LABEL: loop
+; COUNTED-64: call void @do_safepoint
+; COUNTED-64-LABEL: exit
+
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i64 [ 0 , %entry ], [ %counter.inc , %loop ]
+  %counter.inc = add i64 %counter, 1
+  %counter.cmp = icmp sle i64 %counter.inc, %upper
+  br i1 %counter.cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}
\ No newline at end of file

Added: llvm/trunk/test/Transforms/PlaceSafepoints/libcall.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/libcall.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/libcall.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/libcall.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; RUN: opt -S -place-safepoints < %s | FileCheck %s
+
+; Libcalls will not contain a safepoint poll, so check that we insert
+; a safepoint in a loop containing a libcall.
+declare double @ldexp(double %x, i32 %n) nounwind readnone
+define double @test_libcall(double %x) gc "statepoint-example" {
+; CHECK-LABEL: test_libcall
+
+entry:
+; CHECK: entry
+; CHECK-NEXT: call void @do_safepoint
+; CHECK-NEXT: br label %loop
+  br label %loop
+
+loop:
+; CHECK: loop
+; CHECK-NEXT: %x_loop = phi double [ %x, %entry ], [ %x_exp, %loop ]
+; CHECK-NEXT: %x_exp = call double @ldexp(double %x_loop, i32 5)
+; CHECK-NEXT: %done = fcmp ogt double %x_exp, 1.5
+; CHECK-NEXT: call void @do_safepoint
+  %x_loop = phi double [ %x, %entry ], [ %x_exp, %loop ]
+  %x_exp = call double @ldexp(double %x_loop, i32 5) nounwind readnone
+  %done = fcmp ogt double %x_exp, 1.5
+  br i1 %done, label %end, label %loop
+end:
+  %x_end = phi double [%x_exp, %loop]
+  ret double %x_end
+}
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/PlaceSafepoints/memset.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/memset.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/memset.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/memset.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -S -place-safepoints | FileCheck %s
+
+define void @test(i32, i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test
+; CHECK-NEXT: llvm.memset
+; CHECK: do_safepoint
+; CHECK: @foo
+  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* align 8 %ptr, i8 0, i64 24, i1 false)
+  call void @foo()
+  ret void
+}
+
+declare void @foo()
+declare void @llvm.memset.p1i8.i64(i8 addrspace(1)*, i8, i64, i1)
+
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/PlaceSafepoints/no-statepoints.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/no-statepoints.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/no-statepoints.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/no-statepoints.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt -S -place-safepoints < %s | FileCheck %s
+
+declare void @callee()
+
+define void @test() gc "statepoint-example" {
+; CHECK-LABEL: test(
+entry:
+; CHECK: entry:
+; CHECK: call void @do_safepoint()
+  br label %other
+
+other:
+; CHECK: other:
+  call void @callee() "gc-leaf-function"
+; CHECK: call void @do_safepoint()
+  br label %other
+}
+
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/PlaceSafepoints/split-backedge.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/split-backedge.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/split-backedge.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/split-backedge.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+;; A very basic test to make sure that splitting the backedge keeps working
+;; RUN: opt < %s -place-safepoints -spp-split-backedge=1 -S | FileCheck %s
+
+define void @test(i32, i1 %cond) gc "statepoint-example" {
+; CHECK-LABEL: @test
+; CHECK-LABEL: loop.loop_crit_edge
+; CHECK: call void @do_safepoint
+; CHECK-NEXT: br label %loop
+entry:
+  br label %loop
+
+loop:
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Test for the case where a single conditional branch jumps to two
+; different loop header blocks.  Since we're currently using LoopSimplify
+; this doesn't hit the interesting case, but once we remove that, we need
+; to be sure this keeps working.
+define void @test2(i32, i1 %cond) gc "statepoint-example" {
+; CHECK-LABEL: @test2
+; CHECK-LABEL: loop2.loop2_crit_edge:
+; CHECK: call void @do_safepoint
+; CHECK-NEXT: br label %loop2
+; CHECK-LABEL: loop2.loop_crit_edge:
+; CHECK: call void @do_safepoint
+; CHECK-NEXT: br label %loop
+entry:
+  br label %loop
+
+loop:
+  br label %loop2
+
+loop2:
+  br i1 %cond, label %loop, label %loop2
+}
+
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+entry:
+  call void @do_safepoint()
+  ret void
+}
\ No newline at end of file

Added: llvm/trunk/test/Transforms/PlaceSafepoints/statepoint-coreclr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/statepoint-coreclr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/statepoint-coreclr.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/statepoint-coreclr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt < %s -S -place-safepoints | FileCheck %s
+
+; Basic test to make sure that safepoints are placed
+; for CoreCLR GC
+
+declare void @foo()
+
+define void @test_simple_call() gc "coreclr" {
+; CHECK-LABEL: test_simple_call
+entry:
+; CHECK: call void @do_safepoint
+  br label %other
+other:
+  call void @foo()
+  ret void
+}
+
+; This function is inlined when inserting a poll.  To avoid recursive
+; issues, make sure we don't place safepoints in it.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+; CHECK-LABEL: entry
+; CHECK-NEXT: do_safepoint
+; CHECK-NEXT: ret void
+entry:
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/PlaceSafepoints/statepoint-frameescape.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PlaceSafepoints/statepoint-frameescape.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PlaceSafepoints/statepoint-frameescape.ll (added)
+++ llvm/trunk/test/Transforms/PlaceSafepoints/statepoint-frameescape.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt < %s -S -place-safepoints | FileCheck %s
+
+declare void @llvm.localescape(...)
+
+; Do we insert the entry safepoint after the localescape intrinsic?
+define void @parent() gc "statepoint-example" {
+; CHECK-LABEL: @parent
+entry:
+; CHECK-LABEL: entry
+; CHECK-NEXT: alloca
+; CHECK-NEXT: localescape
+; CHECK-NEXT: call void @do_safepoint
+  %ptr = alloca i32
+  call void (...) @llvm.localescape(i32* %ptr)
+  ret void
+}
+
+; This function is inlined when inserting a poll.  To avoid recursive 
+; issues, make sure we don't place safepoints in it.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+; CHECK-LABEL: entry
+; CHECK-NEXT: do_safepoint
+; CHECK-NEXT: ret void 
+entry:
+  call void @do_safepoint()
+  ret void
+}
\ No newline at end of file

Added: llvm/trunk/test/Transforms/PreISelIntrinsicLowering/load-relative.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PreISelIntrinsicLowering/load-relative.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PreISelIntrinsicLowering/load-relative.ll (added)
+++ llvm/trunk/test/Transforms/PreISelIntrinsicLowering/load-relative.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+; RUN: opt -passes='pre-isel-intrinsic-lowering' -S -o - %s | FileCheck %s
+
+; CHECK: define i8* @foo32(i8* [[P:%.*]], i32 [[O:%.*]])
+define i8* @foo32(i8* %p, i32 %o) {
+  ; CHECK: [[OP:%.*]] = getelementptr i8, i8* [[P]], i32 [[O]]
+  ; CHECK: [[OPI32:%.*]] = bitcast i8* [[OP]] to i32*
+  ; CHECK: [[OI32:%.*]] = load i32, i32* [[OPI32]], align 4
+  ; CHECK: [[R:%.*]] = getelementptr i8, i8* [[P]], i32 [[OI32]]
+  ; CHECK: ret i8* [[R]]
+  %l = call i8* @llvm.load.relative.i32(i8* %p, i32 %o)
+  ret i8* %l
+}
+
+; CHECK: define i8* @foo64(i8* [[P:%.*]], i64 [[O:%.*]])
+define i8* @foo64(i8* %p, i64 %o) {
+  ; CHECK: [[OP:%.*]] = getelementptr i8, i8* [[P]], i64 [[O]]
+  ; CHECK: [[OPI32:%.*]] = bitcast i8* [[OP]] to i32*
+  ; CHECK: [[OI32:%.*]] = load i32, i32* [[OPI32]], align 4
+  ; CHECK: [[R:%.*]] = getelementptr i8, i8* [[P]], i32 [[OI32]]
+  ; CHECK: ret i8* [[R]]
+  %l = call i8* @llvm.load.relative.i64(i8* %p, i64 %o)
+  ret i8* %l
+}
+
+declare i8* @llvm.load.relative.i32(i8*, i32)
+declare i8* @llvm.load.relative.i64(i8*, i64)

Added: llvm/trunk/test/Transforms/PreISelIntrinsicLowering/objc-arc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PreISelIntrinsicLowering/objc-arc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PreISelIntrinsicLowering/objc-arc.ll (added)
+++ llvm/trunk/test/Transforms/PreISelIntrinsicLowering/objc-arc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,312 @@
+; RUN: opt -pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
+; RUN: opt -passes='pre-isel-intrinsic-lowering' -S -o - %s | FileCheck %s
+
+; Make sure calls to the objc intrinsics are translated to calls in to the
+; runtime
+
+define i8* @test_objc_autorelease(i8* %arg0) {
+; CHECK-LABEL: test_objc_autorelease
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_autorelease(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.autorelease(i8* %arg0)
+	ret i8* %0
+}
+
+define void @test_objc_autoreleasePoolPop(i8* %arg0) {
+; CHECK-LABEL: test_objc_autoreleasePoolPop
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_autoreleasePoolPop(i8* %arg0)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.autoreleasePoolPop(i8* %arg0)
+  ret void
+}
+
+define i8* @test_objc_autoreleasePoolPush() {
+; CHECK-LABEL: test_objc_autoreleasePoolPush
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_autoreleasePoolPush()
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.autoreleasePoolPush()
+	ret i8* %0
+}
+
+define i8* @test_objc_autoreleaseReturnValue(i8* %arg0) {
+; CHECK-LABEL: test_objc_autoreleaseReturnValue
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_autoreleaseReturnValue(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.autoreleaseReturnValue(i8* %arg0)
+	ret i8* %0
+}
+
+define void @test_objc_copyWeak(i8** %arg0, i8** %arg1) {
+; CHECK-LABEL: test_objc_copyWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_copyWeak(i8** %arg0, i8** %arg1)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.copyWeak(i8** %arg0, i8** %arg1)
+  ret void
+}
+
+define void @test_objc_destroyWeak(i8** %arg0) {
+; CHECK-LABEL: test_objc_destroyWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_destroyWeak(i8** %arg0)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.destroyWeak(i8** %arg0)
+  ret void
+}
+
+define i8* @test_objc_initWeak(i8** %arg0, i8* %arg1) {
+; CHECK-LABEL: test_objc_initWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_initWeak(i8** %arg0, i8* %arg1)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.initWeak(i8** %arg0, i8* %arg1)
+	ret i8* %0
+}
+
+define i8* @test_objc_loadWeak(i8** %arg0) {
+; CHECK-LABEL: test_objc_loadWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_loadWeak(i8** %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.loadWeak(i8** %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_loadWeakRetained(i8** %arg0) {
+; CHECK-LABEL: test_objc_loadWeakRetained
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_loadWeakRetained(i8** %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.loadWeakRetained(i8** %arg0)
+	ret i8* %0
+}
+
+define void @test_objc_moveWeak(i8** %arg0, i8** %arg1) {
+; CHECK-LABEL: test_objc_moveWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_moveWeak(i8** %arg0, i8** %arg1)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.moveWeak(i8** %arg0, i8** %arg1)
+  ret void
+}
+
+define void @test_objc_release(i8* %arg0) {
+; CHECK-LABEL: test_objc_release
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_release(i8* %arg0)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.release(i8* %arg0)
+  ret void
+}
+
+define i8* @test_objc_retain(i8* %arg0) {
+; CHECK-LABEL: test_objc_retain
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retain(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retain(i8* %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_retainAutorelease(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainAutorelease
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retainAutorelease(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retainAutorelease(i8* %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_retainAutoreleaseReturnValue(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainAutoreleaseReturnValue
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleaseReturnValue(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = tail call i8* @llvm.objc.retainAutoreleaseReturnValue(i8* %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_retainAutoreleasedReturnValue(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainAutoreleasedReturnValue
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %arg0)
+	ret i8* %0
+}
+
+define i8* @test_objc_retainBlock(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainBlock
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retainBlock(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retainBlock(i8* %arg0)
+	ret i8* %0
+}
+
+define void @test_objc_storeStrong(i8** %arg0, i8* %arg1) {
+; CHECK-LABEL: test_objc_storeStrong
+; CHECK-NEXT: entry
+; CHECK-NEXT: call void @objc_storeStrong(i8** %arg0, i8* %arg1)
+; CHECK-NEXT: ret void
+entry:
+  call void @llvm.objc.storeStrong(i8** %arg0, i8* %arg1)
+	ret void
+}
+
+define i8* @test_objc_storeWeak(i8** %arg0, i8* %arg1) {
+; CHECK-LABEL: test_objc_storeWeak
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_storeWeak(i8** %arg0, i8* %arg1)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.storeWeak(i8** %arg0, i8* %arg1)
+	ret i8* %0
+}
+
+define i8* @test_objc_unsafeClaimAutoreleasedReturnValue(i8* %arg0) {
+; CHECK-LABEL: test_objc_unsafeClaimAutoreleasedReturnValue
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8* %arg0)
+  ret i8* %0
+}
+
+define i8* @test_objc_retainedObject(i8* %arg0) {
+; CHECK-LABEL: test_objc_retainedObject
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retainedObject(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retainedObject(i8* %arg0)
+  ret i8* %0
+}
+
+define i8* @test_objc_unretainedObject(i8* %arg0) {
+; CHECK-LABEL: test_objc_unretainedObject
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_unretainedObject(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.unretainedObject(i8* %arg0)
+  ret i8* %0
+}
+
+define i8* @test_objc_unretainedPointer(i8* %arg0) {
+; CHECK-LABEL: test_objc_unretainedPointer
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_unretainedPointer(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.unretainedPointer(i8* %arg0)
+  ret i8* %0
+}
+
+define i8* @test_objc_retain_autorelease(i8* %arg0) {
+; CHECK-LABEL: test_objc_retain_autorelease
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i8* @objc_retain_autorelease(i8* %arg0)
+; CHECK-NEXT: ret i8* %0
+entry:
+  %0 = call i8* @llvm.objc.retain.autorelease(i8* %arg0)
+  ret i8* %0
+}
+
+define i32 @test_objc_sync_enter(i8* %arg0) {
+; CHECK-LABEL: test_objc_sync_enter
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i32 @objc_sync_enter(i8* %arg0)
+; CHECK-NEXT: ret i32 %0
+entry:
+  %0 = call i32 @llvm.objc.sync.enter(i8* %arg0)
+  ret i32 %0
+}
+
+define i32 @test_objc_sync_exit(i8* %arg0) {
+; CHECK-LABEL: test_objc_sync_exit
+; CHECK-NEXT: entry
+; CHECK-NEXT: %0 = call i32 @objc_sync_exit(i8* %arg0)
+; CHECK-NEXT: ret i32 %0
+entry:
+  %0 = call i32 @llvm.objc.sync.exit(i8* %arg0)
+  ret i32 %0
+}
+
+declare i8* @llvm.objc.autorelease(i8*)
+declare void @llvm.objc.autoreleasePoolPop(i8*)
+declare i8* @llvm.objc.autoreleasePoolPush()
+declare i8* @llvm.objc.autoreleaseReturnValue(i8*)
+declare void @llvm.objc.copyWeak(i8**, i8**)
+declare void @llvm.objc.destroyWeak(i8**)
+declare extern_weak i8* @llvm.objc.initWeak(i8**, i8*)
+declare i8* @llvm.objc.loadWeak(i8**)
+declare i8* @llvm.objc.loadWeakRetained(i8**)
+declare void @llvm.objc.moveWeak(i8**, i8**)
+declare void @llvm.objc.release(i8*)
+declare i8* @llvm.objc.retain(i8*)
+declare i8* @llvm.objc.retainAutorelease(i8*)
+declare i8* @llvm.objc.retainAutoreleaseReturnValue(i8*)
+declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retainBlock(i8*)
+declare void @llvm.objc.storeStrong(i8**, i8*)
+declare i8* @llvm.objc.storeWeak(i8**, i8*)
+declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*)
+declare i8* @llvm.objc.retainedObject(i8*)
+declare i8* @llvm.objc.unretainedObject(i8*)
+declare i8* @llvm.objc.unretainedPointer(i8*)
+declare i8* @llvm.objc.retain.autorelease(i8*)
+declare i32 @llvm.objc.sync.enter(i8*)
+declare i32 @llvm.objc.sync.exit(i8*)
+
+attributes #0 = { nounwind }
+
+; CHECK: declare i8* @objc_autorelease(i8*)
+; CHECK: declare void @objc_autoreleasePoolPop(i8*)
+; CHECK: declare i8* @objc_autoreleasePoolPush()
+; CHECK: declare i8* @objc_autoreleaseReturnValue(i8*)
+; CHECK: declare void @objc_copyWeak(i8**, i8**)
+; CHECK: declare void @objc_destroyWeak(i8**)
+; CHECK: declare extern_weak i8* @objc_initWeak(i8**, i8*)
+; CHECK: declare i8* @objc_loadWeak(i8**)
+; CHECK: declare i8* @objc_loadWeakRetained(i8**)
+; CHECK: declare void @objc_moveWeak(i8**, i8**)
+; CHECK: declare void @objc_release(i8*) [[NLB:#[0-9]+]]
+; CHECK: declare i8* @objc_retain(i8*) [[NLB]]
+; CHECK: declare i8* @objc_retainAutorelease(i8*)
+; CHECK: declare i8* @objc_retainAutoreleaseReturnValue(i8*)
+; CHECK: declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+; CHECK: declare i8* @objc_retainBlock(i8*)
+; CHECK: declare void @objc_storeStrong(i8**, i8*)
+; CHECK: declare i8* @objc_storeWeak(i8**, i8*)
+; CHECK: declare i8* @objc_unsafeClaimAutoreleasedReturnValue(i8*)
+; CHECK: declare i8* @objc_retainedObject(i8*)
+; CHECK: declare i8* @objc_unretainedObject(i8*)
+; CHECK: declare i8* @objc_unretainedPointer(i8*)
+; CHECK: declare i8* @objc_retain_autorelease(i8*)
+; CHECK: declare i32 @objc_sync_enter(i8*)
+; CHECK: declare i32 @objc_sync_exit(i8*)
+
+; CHECK: attributes #0 = { nounwind }
+; CHECK: attributes [[NLB]] = { nonlazybind }

Added: llvm/trunk/test/Transforms/PruneEH/2008-06-02-Weak.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/2008-06-02-Weak.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/2008-06-02-Weak.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/2008-06-02-Weak.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -prune-eh -S | FileCheck %s
+; RUN: opt < %s -passes='function-attrs,function(simplify-cfg)' -S | FileCheck %s
+
+; We should not infer 'nounwind' for/from a weak function,
+; since it can be overridden by a throwing implementation.
+;
+; CHECK-LABEL: define weak void @f()
+define weak void @f() {
+entry:
+        ret void
+}
+
+; CHECK-LABEL: define void @g()
+define void @g() {
+entry:
+	call void @f()
+	ret void
+}
+
+; CHECK-NOT: {{^}}attributes #{{[0-9].*}} nounwind

Added: llvm/trunk/test/Transforms/PruneEH/ipo-nounwind.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/ipo-nounwind.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/ipo-nounwind.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/ipo-nounwind.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,44 @@
+; RUN: opt -S -prune-eh < %s | FileCheck %s
+; RUN: opt -S -passes='function-attrs,function(simplify-cfg)' < %s | FileCheck %s
+
+declare void @may_throw()
+
+; @callee below may be an optimized form of this function, which can
+; throw at runtime (see r265762 for more details):
+; 
+; define linkonce_odr void @callee(i32* %ptr) noinline {
+; entry:
+;   %val0 = load atomic i32, i32* %ptr unordered, align 4
+;   %val1 = load atomic i32, i32* %ptr unordered, align 4
+;   %cmp = icmp eq i32 %val0, %val1
+;   br i1 %cmp, label %left, label %right
+
+; left:
+;   ret void
+
+; right:
+;   call void @may_throw()
+;   ret void
+; }
+
+define linkonce_odr void @callee(i32* %ptr) noinline {
+  ret void
+}
+
+define i32 @caller(i32* %ptr) personality i32 3 {
+; CHECK-LABEL: @caller(
+; CHECK:  invoke void @callee(i32* %ptr)
+; CHECK-NEXT:          to label %normal unwind label %unwind
+
+entry:
+  invoke void @callee(i32* %ptr)
+          to label %normal unwind label %unwind
+
+normal:
+  ret i32 1
+
+unwind:
+  %res = landingpad { i8*, i32 }
+         cleanup
+  ret i32 2
+}

Added: llvm/trunk/test/Transforms/PruneEH/looptest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/looptest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/looptest.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/looptest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,44 @@
+; RUN: opt < %s -prune-eh -S | FileCheck %s
+
+declare void @nounwind() nounwind
+
+define internal void @foo() {
+	call void @nounwind()
+	ret void
+}
+
+; CHECK-LABEL: @caller
+define i32 @caller(i32 %n) personality i32 (...)* @__gxx_personality_v0 {
+entry:
+  br label %for
+
+for:
+  %j = phi i32 [0, %entry], [%j.inc, %inc]
+  %j.cmp = icmp slt i32 %j, %n
+  br i1 %j.cmp, label %body, label %exit, !llvm.loop !0
+
+body:
+; CHECK: call void @foo(), !llvm.mem.parallel_loop_access !0
+	invoke void @foo( )
+			to label %Normal unwind label %Except, !llvm.mem.parallel_loop_access !0
+  br label %inc
+
+inc:
+  %j.inc = add nuw nsw i32 %j, 1
+  br label %for, !llvm.loop !0
+
+exit:
+  br label %Normal
+
+Normal:
+	ret i32 0
+
+Except:
+        landingpad { i8*, i32 }
+                catch i8* null
+	ret i32 1
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+!0 = distinct !{!0}

Added: llvm/trunk/test/Transforms/PruneEH/musttail.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/musttail.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/musttail.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/musttail.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; RUN: opt -prune-eh -S < %s | FileCheck %s
+
+declare void @noreturn()
+
+define void @testfn() {
+    ; A musttail call must be followed by (optional bitcast then) ret,
+    ; so make sure we don't insert an unreachable
+    ; CHECK: musttail call void @noreturn
+    ; CHECK-NOT: unreachable
+    ; CHECK-NEXT: ret void
+    musttail call void @noreturn() #0
+    ret void
+}
+
+attributes #0 = { noreturn }

Added: llvm/trunk/test/Transforms/PruneEH/operand-bundles.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/operand-bundles.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/operand-bundles.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/operand-bundles.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt < %s -prune-eh -S | FileCheck %s
+; RUN: opt < %s -passes='function-attrs,function(simplify-cfg)' -S | FileCheck %s
+
+declare void @nounwind() nounwind
+
+define internal void @foo() {
+	call void @nounwind()
+	ret void
+}
+
+define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-LABEL: @caller(
+; CHECK-NOT: invoke
+; CHECK: call void @foo() [ "foo"(i32 0, i8 1) ]
+	invoke void @foo() [ "foo"(i32 0, i8 1) ]
+			to label %Normal unwind label %Except
+
+Normal:		; preds = %0
+	ret i32 0
+
+Except:		; preds = %0
+        landingpad { i8*, i32 }
+                catch i8* null
+	ret i32 1
+}
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/PruneEH/pr23971.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/pr23971.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/pr23971.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/pr23971.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt -S -prune-eh < %s | FileCheck %s
+; RUN: opt -S -passes='function-attrs,function(simplify-cfg)' < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f() #0 {
+entry:
+  call void asm sideeffect "ret\0A\09", "~{dirflag},~{fpsr},~{flags}"()
+  unreachable
+}
+
+define i32 @g() {
+entry:
+  call void @f()
+  ret i32 42
+}
+
+; CHECK-LABEL: define i32 @g()
+; CHECK: ret i32 42
+
+attributes #0 = { naked noinline }

Added: llvm/trunk/test/Transforms/PruneEH/pr26263.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/pr26263.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/pr26263.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/pr26263.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,70 @@
+; PruneEH is less powerful than simplify-cfg in terms of cfg simplification,
+; so it leaves some of the unreachable stuff hanging around.
+; Checking it with CHECK-OLD.
+;
+; RUN: opt -prune-eh -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OLD
+; RUN: opt -passes='function-attrs,function(simplify-cfg)' -S < %s | FileCheck %s  --check-prefix=CHECK --check-prefix=CHECK-NEW
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc"
+
+declare void @neverthrows() nounwind
+
+define void @test1() personality i32 (...)* @__CxxFrameHandler3 {
+  invoke void @neverthrows()
+          to label %try.cont unwind label %cleanuppad
+
+try.cont:
+  ret void
+
+cleanuppad:
+  %cp = cleanuppad within none []
+  br label %cleanupret
+
+cleanupret:
+  cleanupret from %cp unwind to caller
+}
+
+; CHECK-LABEL: define void @test1(
+; CHECK:       call void @neverthrows()
+; CHECK-NEW-NEXT: ret void
+; CHECK-NEW-NEXT: }
+; CHECK-OLD:	  ret void
+
+; CHECK-OLD: %[[cp:.*]] = cleanuppad within none []
+; CHECK-OLD-NEXT: unreachable
+
+; CHECK-OLD: cleanupret from %[[cp]] unwind to caller
+
+define void @test2() personality i32 (...)* @__CxxFrameHandler3 {
+  invoke void @neverthrows()
+          to label %try.cont unwind label %catchswitch
+
+try.cont:
+  ret void
+
+catchswitch:
+  %cs = catchswitch within none [label %catchpad] unwind to caller
+
+catchpad:
+  %cp = catchpad within %cs []
+  unreachable
+
+ret:
+  ret void
+}
+
+; CHECK-LABEL: define void @test2(
+; CHECK:       call void @neverthrows()
+; CHECK-NEW-NEXT: ret void
+; CHECK-NEW-NEXT: }
+; CHECK-OLD:      ret void
+
+; CHECK-OLD: %[[cs:.*]] = catchswitch within none [label
+
+; CHECK-OLD: catchpad within %[[cs]] []
+; CHECK-OLD-NEXT: unreachable
+
+; CHECK-OLD:ret void
+
+declare i32 @__CxxFrameHandler3(...)

Added: llvm/trunk/test/Transforms/PruneEH/recursivetest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/recursivetest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/recursivetest.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/recursivetest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s -prune-eh -S | FileCheck %s
+; RUN: opt < %s -passes='function-attrs,function(simplify-cfg)' -S | FileCheck %s
+
+; CHECK-LABEL: define internal i32 @foo()
+define internal i32 @foo() personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-NOT: invoke i32 @foo()
+	invoke i32 @foo( )
+			to label %Normal unwind label %Except		; <i32>:1 [#uses=0]
+Normal:		; preds = %0
+	ret i32 12
+Except:		; preds = %0
+        landingpad { i8*, i32 }
+                catch i8* null
+	ret i32 123
+}
+
+; CHECK-LABEL: define i32 @caller()
+define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-NOT: invoke i32 @foo()
+	invoke i32 @foo( )
+			to label %Normal unwind label %Except		; <i32>:1 [#uses=0]
+Normal:		; preds = %0
+	ret i32 0
+Except:		; preds = %0
+        landingpad { i8*, i32 }
+                catch i8* null
+	ret i32 1
+}
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/PruneEH/seh-nounwind.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/seh-nounwind.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/seh-nounwind.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/seh-nounwind.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -S -prune-eh < %s | FileCheck %s
+; RUN: opt -S -passes='function-attrs,function(simplify-cfg)' < %s | FileCheck %s
+
+; Don't remove invokes of nounwind functions if the personality handles async
+; exceptions. The @div function in this test can fault, even though it can't
+; throw a synchronous exception.
+
+define i32 @div(i32 %n, i32 %d) nounwind {
+entry:
+  %div = sdiv i32 %n, %d
+  ret i32 %div
+}
+
+define i32 @main() nounwind personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) {
+entry:
+  %call = invoke i32 @div(i32 10, i32 0)
+          to label %__try.cont unwind label %lpad
+
+lpad:
+  %0 = landingpad { i8*, i32 }
+          catch i8* null
+  br label %__try.cont
+
+__try.cont:
+  %retval.0 = phi i32 [ %call, %entry ], [ 0, %lpad ]
+  ret i32 %retval.0
+}
+
+; CHECK-LABEL: define i32 @main()
+; CHECK: invoke i32 @div(i32 10, i32 0)
+
+declare i32 @__C_specific_handler(...)

Added: llvm/trunk/test/Transforms/PruneEH/simplenoreturntest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/simplenoreturntest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/simplenoreturntest.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/simplenoreturntest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt < %s -prune-eh -S | not grep "ret i32"
+
+declare void @noreturn() noreturn
+
+define i32 @caller() {
+	call void @noreturn( )
+	ret i32 17
+}
+
+define i32 @caller2() {
+	%T = call i32 @caller( )		; <i32> [#uses=1]
+	ret i32 %T
+}

Added: llvm/trunk/test/Transforms/PruneEH/simpletest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/PruneEH/simpletest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/PruneEH/simpletest.ll (added)
+++ llvm/trunk/test/Transforms/PruneEH/simpletest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+; RUN: opt < %s -prune-eh -S | FileCheck %s
+; RUN: opt < %s -passes='function-attrs,function(simplify-cfg)' -S | FileCheck %s
+
+declare void @nounwind() nounwind
+
+define internal void @foo() {
+	call void @nounwind()
+	ret void
+}
+
+; CHECK-LABEL: define i32 @caller()
+define i32 @caller() personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-NOT: invoke void @foo
+	invoke void @foo( )
+			to label %Normal unwind label %Except
+
+Normal:		; preds = %0
+	ret i32 0
+
+Except:		; preds = %0
+        landingpad { i8*, i32 }
+                catch i8* null
+	ret i32 1
+}
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2002-05-15-AgressiveSubMove.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+define i32 @test1(i32 %A) {
+; CHECK-LABEL: test1
+; CHECK: ret i32 0
+  %X = add i32 %A, 1
+  %Y = add i32 %A, 1
+  %r = sub i32 %X, %Y
+  ret i32 %r
+}

Added: llvm/trunk/test/Transforms/Reassociate/2002-05-15-MissedTree.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2002-05-15-MissedTree.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2002-05-15-MissedTree.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2002-05-15-MissedTree.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+define i32 @test1(i32 %A, i32 %B) {
+; CHECK-LABEL: test1
+; CHECK: %Z = add i32 %B, %A
+; CHECK: ret i32 %Z
+	%W = add i32 %B, -5
+	%Y = add i32 %A, 5
+	%Z = add i32 %W, %Y
+	ret i32 %Z
+}

Added: llvm/trunk/test/Transforms/Reassociate/2002-05-15-SubReassociate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2002-05-15-SubReassociate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2002-05-15-SubReassociate.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2002-05-15-SubReassociate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; RUN: opt < %s -reassociate -constprop -instcombine -dce -S | FileCheck %s
+
+; With sub reassociation, constant folding can eliminate all of the constants.
+define i32 @test1(i32 %A, i32 %B) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[Z:%.*]] = sub i32 %A, %B
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %W = add i32 5, %B
+  %X = add i32 -7, %A
+  %Y = sub i32 %X, %W
+  %Z = add i32 %Y, 12
+  ret i32 %Z
+}
+
+; With sub reassociation, constant folding can eliminate the two 12 constants.
+define i32 @test2(i32 %A, i32 %B, i32 %C, i32 %D) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[SUM:%.*]] = add i32 %B, %A
+; CHECK-NEXT:    [[SUM1:%.*]] = add i32 [[SUM]], %C
+; CHECK-NEXT:    [[Q:%.*]] = sub i32 %D, [[SUM1]]
+; CHECK-NEXT:    ret i32 [[Q]]
+;
+  %M = add i32 %A, 12
+  %N = add i32 %M, %B
+  %O = add i32 %N, %C
+  %P = sub i32 %D, %O
+  %Q = add i32 %P, 12
+  ret i32 %Q
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/2002-07-09-DominanceProblem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2002-07-09-DominanceProblem.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2002-07-09-DominanceProblem.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2002-07-09-DominanceProblem.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; The reassociate pass is not preserving dominance properties correctly
+;
+; RUN: opt < %s -reassociate
+
+define i32 @compute_dist(i32 %i, i32 %j) {
+	%reg119 = sub i32 %j, %i		; <i32> [#uses=1]
+	ret i32 %reg119
+}
+
+

Added: llvm/trunk/test/Transforms/Reassociate/2003-08-12-InfiniteLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2003-08-12-InfiniteLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2003-08-12-InfiniteLoop.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2003-08-12-InfiniteLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -reassociate -disable-output
+
+define i32 @test(i32 %A.1, i32 %B.1, i32 %C.1, i32 %D.1) {
+	%tmp.16 = and i32 %A.1, %B.1		; <i32> [#uses=1]
+	%tmp.18 = and i32 %tmp.16, %C.1		; <i32> [#uses=1]
+	%tmp.20 = and i32 %tmp.18, %D.1		; <i32> [#uses=1]
+	ret i32 %tmp.20
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2005-09-01-ArrayOutOfBounds.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+define i32 @f1(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+; CHECK-LABEL: f1
+; CHECK-NEXT: ret i32 0
+
+  %tmp.2 = add i32 %a4, %a3
+  %tmp.4 = add i32 %tmp.2, %a2
+  %tmp.6 = add i32 %tmp.4, %a1
+  %tmp.8 = add i32 %tmp.6, %a0
+  %tmp.11 = add i32 %a3, %a2
+  %tmp.13 = add i32 %tmp.11, %a1
+  %tmp.15 = add i32 %tmp.13, %a0
+  %tmp.18 = add i32 %a2, %a1
+  %tmp.20 = add i32 %tmp.18, %a0
+  %tmp.23 = add i32 %a1, %a0
+  %tmp.26 = sub i32 %tmp.8, %tmp.15
+  %tmp.28 = add i32 %tmp.26, %tmp.20
+  %tmp.30 = sub i32 %tmp.28, %tmp.23
+  %tmp.32 = sub i32 %tmp.30, %a4
+  %tmp.34 = sub i32 %tmp.32, %a2
+  %T = mul i32 %tmp.34, %tmp.34
+  ret i32 %T
+}

Added: llvm/trunk/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+define <4 x float> @test1() {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %tmp1 = fsub <4 x float> zeroinitializer, zeroinitializer
+; CHECK-NEXT: %tmp2 = fmul <4 x float> %tmp1, zeroinitializer
+; CHECK-NEXT: ret <4 x float> %tmp2
+
+  %tmp1 = fsub <4 x float> zeroinitializer, zeroinitializer
+  %tmp2 = fmul <4 x float> zeroinitializer, %tmp1
+  ret <4 x float> %tmp2
+}

Added: llvm/trunk/test/Transforms/Reassociate/2011-01-26-UseAfterFree.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2011-01-26-UseAfterFree.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2011-01-26-UseAfterFree.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2011-01-26-UseAfterFree.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+; RUN: opt < %s -reassociate
+; PR9039
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
+target triple = "i386-gnu-linux"
+
+define void @exp_averages_intraday__deviation() {
+entry:
+  %0 = load i32, i32* undef, align 4
+  %1 = shl i32 %0, 2
+  %2 = add nsw i32 undef, %1
+  %3 = add nsw i32 %2, undef
+  %4 = mul nsw i32 %0, 12
+  %5 = add nsw i32 %3, %4
+  %6 = add nsw i32 %5, %4
+  %7 = add nsw i32 %6, undef
+  br i1 false, label %"4", label %"12"
+
+"4":                                              ; preds = %entry
+  br i1 undef, label %"5", label %"8"
+
+"5":                                              ; preds = %"4"
+  unreachable
+
+"8":                                              ; preds = %"4"
+  %8 = getelementptr inbounds i8, i8* undef, i32 %6
+  br i1 undef, label %"13", label %"12"
+
+"12":                                             ; preds = %"8", %entry
+  ret void
+
+"13":                                             ; preds = %"8"
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/2012-05-08-UndefLeak.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2012-05-08-UndefLeak.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2012-05-08-UndefLeak.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2012-05-08-UndefLeak.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,85 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+; PR12169
+; PR12764
+; XFAIL: *
+; Transform disabled until PR13021 is fixed.
+
+define i64 @f(i64 %x0) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: mul i64 %x0, 208
+; CHECK-NEXT: add i64 %{{.*}}, 1617
+; CHECK-NEXT: ret i64
+  %t0 = add i64 %x0, 1
+  %t1 = add i64 %x0, 2
+  %t2 = add i64 %x0, 3
+  %t3 = add i64 %x0, 4
+  %t4 = add i64 %x0, 5
+  %t5 = add i64 %x0, 6
+  %t6 = add i64 %x0, 7
+  %t7 = add i64 %x0, 8
+  %t8 = add i64 %x0, 9
+  %t9 = add i64 %x0, 10
+  %t10 = add i64 %x0, 11
+  %t11 = add i64 %x0, 12
+  %t12 = add i64 %x0, 13
+  %t13 = add i64 %x0, 14
+  %t14 = add i64 %x0, 15
+  %t15 = add i64 %x0, 16
+  %t16 = add i64 %x0, 17
+  %t17 = add i64 %x0, 18
+  %t18 = add i64 %t17, %t0
+  %t19 = add i64 %t18, %t1
+  %t20 = add i64 %t19, %t2
+  %t21 = add i64 %t20, %t3
+  %t22 = add i64 %t21, %t4
+  %t23 = add i64 %t22, %t5
+  %t24 = add i64 %t23, %t6
+  %t25 = add i64 %t24, %t7
+  %t26 = add i64 %t25, %t8
+  %t27 = add i64 %t26, %t9
+  %t28 = add i64 %t27, %t10
+  %t29 = add i64 %t28, %t11
+  %t30 = add i64 %t29, %t12
+  %t31 = add i64 %t30, %t13
+  %t32 = add i64 %t31, %t14
+  %t33 = add i64 %t32, %t15
+  %t34 = add i64 %t33, %t16
+  %t35 = add i64 %t34, %x0
+  %t36 = add i64 %t0, %t1
+  %t37 = add i64 %t36, %t2
+  %t38 = add i64 %t37, %t3
+  %t39 = add i64 %t38, %t4
+  %t40 = add i64 %t39, %t5
+  %t41 = add i64 %t40, %t6
+  %t42 = add i64 %t41, %t7
+  %t43 = add i64 %t42, %t8
+  %t44 = add i64 %t43, %t9
+  %t45 = add i64 %t44, %t10
+  %t46 = add i64 %t45, %t11
+  %t47 = add i64 %t46, %t12
+  %t48 = add i64 %t47, %t13
+  %t49 = add i64 %t48, %t14
+  %t50 = add i64 %t49, %t15
+  %t51 = add i64 %t50, %t16
+  %t52 = add i64 %t51, %t17
+  %t53 = add i64 %t52, %t18
+  %t54 = add i64 %t53, %t19
+  %t55 = add i64 %t54, %t20
+  %t56 = add i64 %t55, %t21
+  %t57 = add i64 %t56, %t22
+  %t58 = add i64 %t57, %t23
+  %t59 = add i64 %t58, %t24
+  %t60 = add i64 %t59, %t25
+  %t61 = add i64 %t60, %t26
+  %t62 = add i64 %t61, %t27
+  %t63 = add i64 %t62, %t28
+  %t64 = add i64 %t63, %t29
+  %t65 = add i64 %t64, %t30
+  %t66 = add i64 %t65, %t31
+  %t67 = add i64 %t66, %t32
+  %t68 = add i64 %t67, %t33
+  %t69 = add i64 %t68, %t34
+  %t70 = add i64 %t69, %t35
+  %t71 = add i64 %t70, %x0
+  ret i64 %t71
+}

Added: llvm/trunk/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/2012-06-08-InfiniteLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; RUN: opt < %s -reassociate -disable-output
+; PR13041
+
+define void @foo() {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %b.0 = phi i32 [ undef, %entry ], [ %sub2, %while.body ]
+  %c.0 = phi i32 [ undef, %entry ], [ %sub3, %while.body ]
+  br i1 undef, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %sub = sub nsw i32 0, %b.0
+  %sub2 = sub nsw i32 %sub, %c.0
+  %sub3 = sub nsw i32 0, %c.0
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/absorption.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/absorption.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/absorption.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/absorption.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -reassociate < %s | FileCheck %s
+
+; Check that if constants combine to an absorbing value then the expression is
+; evaluated as the absorbing value.
+
+define i8 @or_all_ones(i8 %x) {
+; CHECK-LABEL: @or_all_ones(
+; CHECK-NEXT:    ret i8 -1
+;
+  %tmp1 = or i8 %x, 127
+  %tmp2 = or i8 %tmp1, 128
+  ret i8 %tmp2
+}
+
+; TODO: fmul by 0.0 with nsz+nnan should have simplified to 0.0.
+
+define double @fmul_zero(double %x) {
+; CHECK-LABEL: @fmul_zero(
+; CHECK-NEXT:    [[R:%.*]] = fmul fast double [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret double [[R]]
+;
+  %x4 = fmul fast double %x, 4.0
+  %r = fmul fast double %x4, 0.0
+  ret double %r
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/add_across_block_crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/add_across_block_crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/add_across_block_crash.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/add_across_block_crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; This test makes sure that, while processing a block, uses of instructions
+; from a different basic block don't get added to be re-optimized.
+
+define  void @main() {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label %bb1, label %bb2
+; CHECK:       bb1:
+; CHECK-NEXT:    ret void
+; CHECK:       bb2:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = fadd fast float undef, undef
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  %1 = fmul fast float undef, -2.000000e+00
+  %2 = fmul fast float %1, 2.000000e+00
+  %3 = fadd fast float %2, 2.000000e+00
+  %4 = fadd fast float %3, %0
+  %mul351 = fmul fast float %4, 5.000000e-01
+  ret void
+
+bb2:
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/basictest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/basictest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/basictest.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/basictest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,297 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -gvn -instcombine -S | FileCheck %s
+; RUN: opt < %s -passes='reassociate,gvn,instcombine' -S | FileCheck %s
+
+define i32 @test1(i32 %arg) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[ARG_NEG:%.*]] = sub i32 0, [[ARG:%.*]]
+; CHECK-NEXT:    ret i32 [[ARG_NEG]]
+;
+  %tmp1 = sub i32 -12, %arg
+  %tmp2 = add i32 %tmp1, 12
+  ret i32 %tmp2
+}
+
+define i32 @test2(i32 %reg109, i32 %reg1111) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[REG117:%.*]] = add i32 [[REG1111:%.*]], [[REG109:%.*]]
+; CHECK-NEXT:    ret i32 [[REG117]]
+;
+  %reg115 = add i32 %reg109, -30
+  %reg116 = add i32 %reg115, %reg1111
+  %reg117 = add i32 %reg116, 30
+  ret i32 %reg117
+}
+
+ at e = external global i32
+ at a = external global i32
+ at b = external global i32
+ at c = external global i32
+ at f = external global i32
+
+define void @test3() {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* @b, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* @c, align 4
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B]], [[A]]
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[C]]
+; CHECK-NEXT:    store i32 [[T2]], i32* @e, align 4
+; CHECK-NEXT:    store i32 [[T2]], i32* @f, align 4
+; CHECK-NEXT:    ret void
+;
+  %A = load i32, i32* @a
+  %B = load i32, i32* @b
+  %C = load i32, i32* @c
+  %t1 = add i32 %A, %B
+  %t2 = add i32 %t1, %C
+  %t3 = add i32 %C, %A
+  %t4 = add i32 %t3, %B
+  ; e = (a+b)+c;
+  store i32 %t2, i32* @e
+  ; f = (a+c)+b
+  store i32 %t4, i32* @f
+  ret void
+}
+
+define void @test4() {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* @b, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* @c, align 4
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B]], [[A]]
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[C]]
+; CHECK-NEXT:    store i32 [[T2]], i32* @e, align 4
+; CHECK-NEXT:    store i32 [[T2]], i32* @f, align 4
+; CHECK-NEXT:    ret void
+;
+  %A = load i32, i32* @a
+  %B = load i32, i32* @b
+  %C = load i32, i32* @c
+  %t1 = add i32 %A, %B
+  %t2 = add i32 %t1, %C
+  %t3 = add i32 %C, %A
+  %t4 = add i32 %t3, %B
+  ; e = c+(a+b)
+  store i32 %t2, i32* @e
+  ; f = (c+a)+b
+  store i32 %t4, i32* @f
+  ret void
+}
+
+define void @test5() {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* @b, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* @c, align 4
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B]], [[A]]
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[C]]
+; CHECK-NEXT:    store i32 [[T2]], i32* @e, align 4
+; CHECK-NEXT:    store i32 [[T2]], i32* @f, align 4
+; CHECK-NEXT:    ret void
+;
+  %A = load i32, i32* @a
+  %B = load i32, i32* @b
+  %C = load i32, i32* @c
+  %t1 = add i32 %B, %A
+  %t2 = add i32 %t1, %C
+  %t3 = add i32 %C, %A
+  %t4 = add i32 %t3, %B
+  ; e = c+(b+a)
+  store i32 %t2, i32* @e
+  ; f = (c+a)+b
+  store i32 %t4, i32* @f
+  ret void
+}
+
+define i32 @test6() {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    ret i32 0
+;
+  %tmp.0 = load i32, i32* @a
+  %tmp.1 = load i32, i32* @b
+  ; (a+b)
+  %tmp.2 = add i32 %tmp.0, %tmp.1
+  %tmp.4 = load i32, i32* @c
+  ; (a+b)+c
+  %tmp.5 = add i32 %tmp.2, %tmp.4
+  ; (a+c)
+  %tmp.8 = add i32 %tmp.0, %tmp.4
+  ; (a+c)+b
+  %tmp.11 = add i32 %tmp.8, %tmp.1
+  ; X ^ X = 0
+  %RV = xor i32 %tmp.5, %tmp.11
+  ret i32 %RV
+}
+
+; This should be one add and two multiplies.
+; A*A*B + A*C*A
+
+define i32 @test7(i32 %A, i32 %B, i32 %C) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[REASS_ADD1:%.*]] = add i32 [[C:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[REASS_MUL2:%.*]] = mul i32 [[A:%.*]], [[A]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul i32 [[REASS_MUL2]], [[REASS_ADD1]]
+; CHECK-NEXT:    ret i32 [[REASS_MUL]]
+;
+  %aa = mul i32 %A, %A
+  %aab = mul i32 %aa, %B
+  %ac = mul i32 %A, %C
+  %aac = mul i32 %ac, %A
+  %r = add i32 %aab, %aac
+  ret i32 %r
+}
+
+define i32 @test8(i32 %X, i32 %Y, i32 %Z) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = sub i32 [[Z:%.*]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = sub i32 0, %X
+  %B = mul i32 %A, %Y
+  ; (-X)*Y + Z -> Z-X*Y
+  %C = add i32 %B, %Z
+  ret i32 %C
+}
+
+; PR5458
+
+define i32 @test9(i32 %X) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[FACTOR:%.*]] = mul i32 [[X:%.*]], 94
+; CHECK-NEXT:    ret i32 [[FACTOR]]
+;
+  %Y = mul i32 %X, 47
+  %Z = add i32 %Y, %Y
+  ret i32 %Z
+}
+
+define i32 @test10(i32 %X) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[FACTOR:%.*]] = mul i32 [[X:%.*]], 3
+; CHECK-NEXT:    ret i32 [[FACTOR]]
+;
+  %Y = add i32 %X ,%X
+  %Z = add i32 %Y, %X
+  ret i32 %Z
+}
+
+define i32 @test11(i32 %W) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[FACTOR:%.*]] = mul i32 [[W:%.*]], 381
+; CHECK-NEXT:    ret i32 [[FACTOR]]
+;
+  %X = mul i32 %W, 127
+  %Y = add i32 %X ,%X
+  %Z = add i32 %Y, %X
+  ret i32 %Z
+}
+
+declare void @mumble(i32)
+
+define i32 @test12(i32 %X) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[X_NEG:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    call void @mumble(i32 [[X_NEG]])
+; CHECK-NEXT:    [[FACTOR:%.*]] = mul i32 [[X]], -3
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[FACTOR]], 6
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %X.neg = sub nsw nuw i32 0, %X
+  call void @mumble(i32 %X.neg)
+  %A = sub i32 1, %X
+  %B = sub i32 2, %X
+  %C = sub i32 3, %X
+  %Y = add i32 %A ,%B
+  %Z = add i32 %Y, %C
+  ret i32 %Z
+}
+
+define i32 @test13(i32 %X1, i32 %X2, i32 %X3) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = sub i32 [[X3:%.*]], [[X2:%.*]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul i32 [[REASS_ADD]], [[X1:%.*]]
+; CHECK-NEXT:    ret i32 [[REASS_MUL]]
+;
+  %A = sub i32 0, %X1
+  %B = mul i32 %A, %X2   ; -X1*X2
+  %C = mul i32 %X1, %X3  ; X1*X3
+  %D = add i32 %B, %C    ; -X1*X2 + X1*X3 -> X1*(X3-X2)
+  ret i32 %D
+}
+
+; PR5359
+
+define i32 @test14(i32 %X1, i32 %X2) {
+; CHECK-LABEL: @test14(
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = sub i32 [[X1:%.*]], [[X2:%.*]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul i32 [[REASS_ADD]], 47
+; CHECK-NEXT:    ret i32 [[REASS_MUL]]
+;
+  %B = mul i32 %X1, 47   ; X1*47
+  %C = mul i32 %X2, -47  ; X2*-47
+  %D = add i32 %B, %C    ; X1*47 + X2*-47 -> 47*(X1-X2)
+  ret i32 %D
+}
+
+; Do not reassociate expressions of type i1
+
+define i32 @test15(i32 %X1, i32 %X2, i32 %X3) {
+; CHECK-LABEL: @test15(
+; CHECK-NEXT:    [[A:%.*]] = icmp ne i32 [[X1:%.*]], 0
+; CHECK-NEXT:    [[B:%.*]] = icmp slt i32 [[X2:%.*]], [[X3:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = and i1 [[A]], [[B]]
+; CHECK-NEXT:    [[D:%.*]] = select i1 [[C]], i32 [[X1]], i32 0
+; CHECK-NEXT:    ret i32 [[D]]
+;
+  %A = icmp ne i32 %X1, 0
+  %B = icmp slt i32 %X2, %X3
+  %C = and i1 %A, %B
+  %D = select i1 %C, i32 %X1, i32 0
+  ret i32 %D
+}
+
+; PR30256 - previously this asserted.
+
+define i64 @test16(i1 %cmp, i64 %a, i64 %b) {
+; CHECK-LABEL: @test16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CMP:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[FACTOR:%.*]] = mul i64 [[A:%.*]], -4
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[FACTOR]], [[B:%.*]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  %shl = shl i64 %a, 1
+  %shl.neg = sub i64 0, %shl
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %add1 = add i64 %shl.neg, %shl.neg
+  %add2 = add i64 %add1, %b
+  ret i64 %add2
+
+if.end:
+  ret i64 0
+}
+
+define i32 @test17(i32 %X1, i32 %X2, i32 %X3, i32 %X4) {
+; CHECK-LABEL: @test17(
+; CHECK-NEXT:    [[A:%.*]] = mul i32 [[X4:%.*]], [[X3:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[A]], [[X1:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = mul i32 [[A]], [[X2:%.*]]
+; CHECK-NEXT:    [[E:%.*]] = xor i32 [[C]], [[D]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %A = mul i32 %X3, %X1
+  %B = mul i32 %X3, %X2
+  %C = mul i32 %A, %X4
+  %D = mul i32 %B, %X4
+  %E = xor i32 %C, %D
+  ret i32 %E
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/binop-identity.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/binop-identity.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/binop-identity.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/binop-identity.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; Don't produce an instruction that is a no-op because the constant is an identity constant.
+
+define i32 @add_0(i32 %x) {
+; CHECK-LABEL: @add_0(
+; CHECK-NEXT:    ret i32 [[X:%.*]]
+;
+  %a1 = add i32 %x, -30
+  %a2 = add i32 %a1, 30
+  ret i32 %a2
+}
+
+define i32 @mul_1(i32 %x) {
+; CHECK-LABEL: @mul_1(
+; CHECK-NEXT:    ret i32 [[X:%.*]]
+;
+  %a1 = mul i32 %x, -1
+  %a2 = mul i32 %a1, -1
+  ret i32 %a2
+}
+
+define i8 @and_neg1(i8 %x) {
+; CHECK-LABEL: @and_neg1(
+; CHECK-NEXT:    ret i8 [[X:%.*]]
+;
+  %a1 = and i8 %x, 255
+  %a2 = and i8 %a1, 255
+  ret i8 %a2
+}
+
+define i8 @or_0(i8 %x) {
+; CHECK-LABEL: @or_0(
+; CHECK-NEXT:    ret i8 [[X:%.*]]
+;
+  %a1 = or i8 %x, 0
+  %a2 = or i8 %a1, 0
+  ret i8 %a2
+}
+
+define i8 @xor_0(i8 %x) {
+; CHECK-LABEL: @xor_0(
+; CHECK-NEXT:    ret i8 [[X:%.*]]
+;
+  %a1 = xor i8 %x, 42
+  %a2 = xor i8 %a1, 42
+  ret i8 %a2
+}
+
+; FIXME - the binop identity constant for fadd is -0.0, so this didn't fold.
+
+define float @fadd_0(float %x) {
+; CHECK-LABEL: @fadd_0(
+; CHECK-NEXT:    [[A2:%.*]] = fadd fast float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret float [[A2]]
+;
+  %a1 = fadd fast float %x, -30.0
+  %a2 = fadd fast float %a1, 30.0
+  ret float %a2
+}
+
+define float @fmul_1(float %x) {
+; CHECK-LABEL: @fmul_1(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %a1 = fmul fast float %x, 4.0
+  %a2 = fmul fast float %a1, 0.25
+  ret float %a2
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/canonicalize-neg-const.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/canonicalize-neg-const.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/canonicalize-neg-const.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/canonicalize-neg-const.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,185 @@
+; RUN: opt -reassociate -gvn -S < %s | FileCheck %s
+
+; (x + 0.1234 * y) * (x + -0.1234 * y) -> (x + 0.1234 * y) * (x - 0.1234 * y)
+define double @test1(double %x, double %y) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double %y, 1.234000e-01
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double %x, [[MUL]]
+; CHECK-NEXT:    [[ADD21:%.*]] = fsub double %x, [[MUL]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul double [[ADD]], [[ADD21]]
+; CHECK-NEXT:    ret double [[MUL3]]
+;
+  %mul = fmul double 1.234000e-01, %y
+  %add = fadd double %mul, %x
+  %mul1 = fmul double -1.234000e-01, %y
+  %add2 = fadd double %mul1, %x
+  %mul3 = fmul double %add, %add2
+  ret double %mul3
+}
+
+; (x + -0.1234 * y) * (x + -0.1234 * y) -> (x - 0.1234 * y) * (x - 0.1234 * y)
+define double @test2(double %x, double %y) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double %y, 1.234000e-01
+; CHECK-NEXT:    [[ADD1:%.*]] = fsub double %x, [[MUL]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[MUL3]]
+;
+  %mul = fmul double %y, -1.234000e-01
+  %add = fadd double %mul, %x
+  %mul1 = fmul double %y, -1.234000e-01
+  %add2 = fadd double %mul1, %x
+  %mul3 = fmul double %add, %add2
+  ret double %mul3
+}
+
+; (x + 0.1234 * y) * (x - -0.1234 * y) -> (x + 0.1234 * y) * (x + 0.1234 * y)
+define double @test3(double %x, double %y) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double %y, 1.234000e-01
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double %x, [[MUL]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul double [[ADD]], [[ADD]]
+; CHECK-NEXT:    ret double [[MUL3]]
+;
+  %mul = fmul double %y, 1.234000e-01
+  %add = fadd double %mul, %x
+  %mul1 = fmul double %y, -1.234000e-01
+  %add2 = fsub double %x, %mul1
+  %mul3 = fmul double %add, %add2
+  ret double %mul3
+}
+
+; Canonicalize (x - -0.1234 * y)
+define double @test5(double %x, double %y) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double %y, 1.234000e-01
+; CHECK-NEXT:    [[SUB1:%.*]] = fadd double %x, [[MUL]]
+; CHECK-NEXT:    ret double [[SUB1]]
+;
+  %mul = fmul double -1.234000e-01, %y
+  %sub = fsub double %x, %mul
+  ret double %sub
+}
+
+; Don't modify (-0.1234 * y - x)
+define double @test6(double %x, double %y) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double %y, -1.234000e-01
+; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[MUL]], %x
+; CHECK-NEXT:    ret double [[SUB]]
+;
+  %mul = fmul double -1.234000e-01, %y
+  %sub = fsub double %mul, %x
+  ret double %sub
+}
+
+; Canonicalize (-0.1234 * y + x) -> (x - 0.1234 * y)
+define double @test7(double %x, double %y) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double %y, 1.234000e-01
+; CHECK-NEXT:    [[ADD1:%.*]] = fsub double %x, [[MUL]]
+; CHECK-NEXT:    ret double [[ADD1]]
+;
+  %mul = fmul double -1.234000e-01, %y
+  %add = fadd double %mul, %x
+  ret double %add
+}
+
+; Canonicalize (y * -0.1234 + x) -> (x - 0.1234 * y)
+define double @test8(double %x, double %y) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double %y, 1.234000e-01
+; CHECK-NEXT:    [[ADD1:%.*]] = fsub double %x, [[MUL]]
+; CHECK-NEXT:    ret double [[ADD1]]
+;
+  %mul = fmul double %y, -1.234000e-01
+  %add = fadd double %mul, %x
+  ret double %add
+}
+
+; Canonicalize (x - -0.1234 / y)
+define double @test9(double %x, double %y) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double 1.234000e-01, %y
+; CHECK-NEXT:    [[SUB1:%.*]] = fadd double %x, [[DIV]]
+; CHECK-NEXT:    ret double [[SUB1]]
+;
+  %div = fdiv double -1.234000e-01, %y
+  %sub = fsub double %x, %div
+  ret double %sub
+}
+
+; Don't modify (-0.1234 / y - x)
+define double @test10(double %x, double %y) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double -1.234000e-01, %y
+; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[DIV]], %x
+; CHECK-NEXT:    ret double [[SUB]]
+;
+  %div = fdiv double -1.234000e-01, %y
+  %sub = fsub double %div, %x
+  ret double %sub
+}
+
+; Canonicalize (-0.1234 / y + x) -> (x - 0.1234 / y)
+define double @test11(double %x, double %y) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double 1.234000e-01, %y
+; CHECK-NEXT:    [[ADD1:%.*]] = fsub double %x, [[DIV]]
+; CHECK-NEXT:    ret double [[ADD1]]
+;
+  %div = fdiv double -1.234000e-01, %y
+  %add = fadd double %div, %x
+  ret double %add
+}
+
+; Canonicalize (y / -0.1234 + x) -> (x - y / 0.1234)
+define double @test12(double %x, double %y) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double %y, 1.234000e-01
+; CHECK-NEXT:    [[ADD1:%.*]] = fsub double %x, [[DIV]]
+; CHECK-NEXT:    ret double [[ADD1]]
+;
+  %div = fdiv double %y, -1.234000e-01
+  %add = fadd double %div, %x
+  ret double %add
+}
+
+; Don't create an NSW violation
+define i4 @test13(i4 %x) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i4 %x, -2
+; CHECK-NEXT:    [[ADD:%.*]] = add i4 [[MUL]], 3
+; CHECK-NEXT:    ret i4 [[ADD]]
+;
+  %mul = mul nsw i4 %x, -2
+  %add = add i4 %mul, 3
+  ret i4 %add
+}
+
+; This tests used to cause an infinite loop where we would loop between
+; canonicalizing the negated constant (i.e., (X + Y*-5.0) -> (X - Y*5.0)) and
+; breaking up a subtract (i.e., (X - Y*5.0) -> X + (0 - Y*5.0)). To break the
+; cycle, we don't canonicalize the negative constant if we're going to later
+; break up the subtract.
+;
+; Check to make sure we don't canonicalize
+;   (%pow2*-5.0 + %sub) -> (%sub - %pow2*5.0)
+; as we would later break up this subtract causing a cycle.
+
+define double @pr34078(double %A) {
+; CHECK-LABEL: @pr34078(
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast double 1.000000e+00, %A
+; CHECK-NEXT:    [[POW2:%.*]] = fmul double %A, %A
+; CHECK-NEXT:    [[MUL5_NEG:%.*]] = fmul fast double [[POW2]], -5.000000e-01
+; CHECK-NEXT:    [[SUB1:%.*]] = fadd fast double [[MUL5_NEG]], [[SUB]]
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast double [[SUB1]], 2.000000e+00
+; CHECK-NEXT:    ret double [[FACTOR]]
+;
+  %sub = fsub fast double 1.000000e+00, %A
+  %pow2 = fmul double %A, %A
+  %mul5 = fmul fast double %pow2, 5.000000e-01
+  %sub1 = fsub fast double %sub, %mul5
+  %add = fadd fast double %sub1, %sub1
+  ret double %add
+}

Added: llvm/trunk/test/Transforms/Reassociate/commute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/commute.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/commute.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/commute.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt -reassociate -S < %s | FileCheck %s
+
+declare void @use(i32)
+
+define void @test1(i32 %x, i32 %y) {
+; CHECK-LABEL: test1
+; CHECK: mul i32 %y, %x
+; CHECK: mul i32 %y, %x
+; CHECK: sub i32 %1, %2
+; CHECK: call void @use(i32 %{{.*}})
+; CHECK: call void @use(i32 %{{.*}})
+
+  %1 = mul i32 %x, %y
+  %2 = mul i32 %y, %x
+  %3 = sub i32 %1, %2
+  call void @use(i32 %1)
+  call void @use(i32 %3)
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/crash.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,174 @@
+; RUN: opt -reassociate -disable-output < %s
+
+
+; rdar://7507855
+define fastcc i32 @test1() nounwind {
+entry:
+  %cond = select i1 undef, i32 1, i32 -1          ; <i32> [#uses=2]
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %sub889 = sub i32 undef, undef                  ; <i32> [#uses=1]
+  %sub891 = sub i32 %sub889, %cond                ; <i32> [#uses=0]
+  %add896 = sub i32 0, %cond                      ; <i32> [#uses=0]
+  ret i32 undef
+}
+
+; PR5981
+define i32 @test2() nounwind ssp {
+entry:
+  %0 = load i32, i32* undef, align 4
+  %1 = mul nsw i32 undef, %0
+  %2 = mul nsw i32 undef, %0
+  %3 = add nsw i32 undef, %1
+  %4 = add nsw i32 %3, %2
+  %5 = add nsw i32 %4, 4
+  %6 = shl i32 %0, 3
+  %7 = add nsw i32 %5, %6
+  br label %bb4.i9
+
+bb4.i9:
+  %8 = add nsw i32 undef, %1
+  ret i32 0
+}
+
+
+define i32 @test3(i32 %Arg, i32 %x1, i32 %x2, i32 %x3) {
+ %A = mul i32 %x1, %Arg
+ %B = mul i32 %Arg, %x2 ;; Part of add operation being factored, also used by C
+ %C = mul i32 %x3, %B
+
+ %D = add i32 %A, %B
+ %E = add i32 %D, %C
+  ret i32 %E
+}
+
+
+; rdar://9096268
+define void @x66303361ae3f602889d1b7d0f86e5455(i8* %arg) nounwind {
+_:
+  br label %_33
+
+_33:                                              ; preds = %_33, %_
+  %tmp348 = load i8, i8* %arg, align 1
+  %tmp349 = lshr i8 %tmp348, 7
+  %tmp350 = or i8 %tmp349, 42
+  %tmp351 = add i8 %tmp350, -42
+  %tmp352 = zext i8 %tmp351 to i32
+  %tmp358 = add i32 %tmp352, -501049439
+  %tmp359 = mul i32 %tmp358, %tmp358
+  %tmp360 = mul i32 %tmp352, %tmp352
+  %tmp361 = sub i32 %tmp359, %tmp360
+  %tmp362 = mul i32 %tmp361, -920056735
+  %tmp363 = add i32 %tmp362, 501049439
+  %tmp364 = add i32 %tmp362, -2000262972
+  %tmp365 = sub i32 %tmp363, %tmp364
+  %tmp366 = sub i32 -501049439, %tmp362
+  %tmp367 = add i32 %tmp365, %tmp366
+  br label %_33
+}
+
+define void @test(i32 %a, i32 %b, i32 %c, i32 %d) {
+  %tmp.2 = xor i32 %a, %b		; <i32> [#uses=1]
+  %tmp.5 = xor i32 %c, %d		; <i32> [#uses=1]
+  %tmp.6 = xor i32 %tmp.2, %tmp.5		; <i32> [#uses=1]
+  %tmp.9 = xor i32 %c, %a		; <i32> [#uses=1]
+  %tmp.12 = xor i32 %b, %d		; <i32> [#uses=1]
+  %tmp.13 = xor i32 %tmp.9, %tmp.12		; <i32> [#uses=1]
+  %tmp.16 = xor i32 %tmp.6, %tmp.13		; <i32> [#uses=0]
+  ret void
+}
+
+define i128 @foo() {
+  %mul = mul i128 0, 0
+  ret i128 %mul
+}
+
+define void @infinite_loop() {
+entry:
+  br label %loop
+loop:
+  %x = phi i32 [undef, %entry], [%x, %loop]
+  %dead = add i32 %x, 0
+  br label %loop
+unreachable1:
+  %y1 = add i32 %y1, 0
+  %z1 = add i32 %y1, 0
+  ret void
+unreachable2:
+  %y2 = add i32 %y2, 0
+  %z2 = add i32 %y2, %y2
+  ret void
+unreachable3:
+  %y3 = add i32 %y3, %y3
+  %z3 = add i32 %y3, 0
+  ret void
+unreachable4:
+  %y4 = add i32 %y4, %y4
+  %z4 = add i32 %y4, %y4
+  ret void
+}
+
+; PR13185
+define void @pr13185(i16 %p) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %x.0 = phi i32 [ undef, %entry ], [ %conv, %for.cond ]
+  %conv = zext i16 %p to i32
+  br label %for.cond
+}
+
+; PR12963
+ at a = external global i8
+define i8 @f0(i8 %x) {
+  %t0 = load i8, i8* @a
+  %t1 = mul i8 %x, %x
+  %t2 = mul i8 %t1, %t1
+  %t3 = mul i8 %t2, %t2
+  %t4 = mul i8 %t3, %x
+  %t5 = mul i8 %t4, %t4
+  %t6 = mul i8 %t5, %x
+  %t7 = mul i8 %t6, %t0
+  ret i8 %t7
+}
+
+define i32 @sozefx_(i32 %x, i32 %y) {
+  %t0 = sub i32 %x, %x
+  %t1 = mul i32 %t0, %t0
+  %t2 = mul i32 %x, %t0
+  %t3 = mul i32 %t1, %t1
+  %t4 = add i32 %t2, %t3
+  %t5 = mul i32 %x, %y
+  %t6 = add i32 %t4, %t5
+  ret i32 %t6
+}
+
+define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) {
+  %tmp1 = mul i32 %arg1, 2
+  %tmp2 = mul i32 %tmp1, 3
+  %tmp3 = mul i32 %arg2, 2
+  %tmp4 = add i32 %tmp1, 1 ; dead code
+  %ret = add i32 %tmp2, %tmp3
+  ret i32 %ret
+}
+
+; PR14060
+define i8 @hang(i8 %p, i8 %p0, i8 %p1, i8 %p2, i8 %p3, i8 %p4, i8 %p5, i8 %p6, i8 %p7, i8 %p8, i8 %p9) {
+  %tmp = zext i1 false to i8
+  %tmp16 = or i8 %tmp, 1
+  %tmp22 = or i8 %p7, %p0
+  %tmp23 = or i8 %tmp16, %tmp22
+  %tmp28 = or i8 %p9, %p1
+  %tmp31 = or i8 %tmp23, %p2
+  %tmp32 = or i8 %tmp31, %tmp28
+  %tmp38 = or i8 %p8, %p3
+  %tmp39 = or i8 %tmp16, %tmp38
+  %tmp43 = or i8 %tmp39, %p4
+  %tmp44 = or i8 %tmp43, 1
+  %tmp47 = or i8 %tmp32, %p5
+  %tmp50 = or i8 %tmp47, %p6
+  %tmp51 = or i8 %tmp44, %tmp50
+  ret i8 %tmp51
+}

Added: llvm/trunk/test/Transforms/Reassociate/crash2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/crash2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/crash2.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/crash2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -reassociate %s -S -o - | FileCheck %s
+
+; Reassociate pass used to crash on these example
+
+ at g = global i32 0
+
+define float @undef1() {
+; CHECK-LABEL: @undef1(
+; CHECK-NEXT:    ret float fadd (float bitcast (i32 ptrtoint (i32* @g to i32) to float), float fadd (float bitcast (i32 ptrtoint (i32* @g to i32) to float), float fadd (float fsub (float -0.000000e+00, float bitcast (i32 ptrtoint (i32* @g to i32) to float)), float fsub (float -0.000000e+00, float bitcast (i32 ptrtoint (i32* @g to i32) to float)))))
+;
+  %t0 = fadd fast float bitcast (i32 ptrtoint (i32* @g to i32) to float), bitcast (i32 ptrtoint (i32* @g to i32) to float)
+  %t1 = fsub fast float bitcast (i32 ptrtoint (i32* @g to i32) to float), %t0
+  %t2 = fadd fast float bitcast (i32 ptrtoint (i32* @g to i32) to float), %t1
+  ret float %t2
+}
+
+define void @undef2() {
+; CHECK-LABEL: @undef2(
+; CHECK-NEXT:    unreachable
+;
+  %t0 = fadd fast float bitcast (i32 ptrtoint (i32* @g to i32) to float), bitcast (i32 ptrtoint (i32* @g to i32) to float)
+  %t1 = fadd fast float %t0, 1.0
+  %t2 = fsub fast float %t0, %t1
+  %t3 = fmul fast float %t2, 2.0
+  unreachable
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/deadcode.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/deadcode.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/deadcode.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/deadcode.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; RUN: opt < %s -reassociate -disable-output
+
+; It has been detected that dead loops like the one in this test case can be
+; created by -jump-threading (it was detected by a csmith generated program).
+;
+; According to -verify this is valid input (even if it could be discussed if
+; the dead loop really satisfies SSA form).
+;
+; The problem found was that the -reassociate pass ends up in an infinite loop
+; when analysing the 'deadloop1' basic block. See "Bugzilla - Bug 30818".
+define void @deadloop1() {
+  br label %endlabel
+
+deadloop1:
+  %1 = xor i32 %2, 7
+  %2 = xor i32 %1, 8
+  br label %deadloop1
+
+endlabel:
+  ret void
+}
+
+
+; Another example showing that dead code could result in infinite loops in
+; reassociate pass. See "Bugzilla - Bug 30818".
+define void @deadloop2() {
+  br label %endlabel
+
+deadloop2:
+  %1 = and i32 %2, 7
+  %2 = and i32 %3, 8
+  %3 = and i32 %1, 6
+  br label %deadloop2
+
+endlabel:
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/erase_inst_made_change.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/erase_inst_made_change.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/erase_inst_made_change.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/erase_inst_made_change.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt < %s -inline -reassociate -S | FileCheck %s
+
+; This test case exposed a bug in reassociate where EraseInst's
+; removal of a dead call wasn't recognized as changing the IR.
+; So when runOnFunction propagated the "made changes" upwards
+; to the CallGraphSCCPass it signalled that no changes had been
+; made, so CallGraphSCCPass assumed that the old CallGraph,
+; as known by that pass manager, still was up-to-date.
+;
+; This was detected as an assert when trying to remove the
+; no longer used function 'bar' (due to incorrect reference
+; count in the CallGraph).
+
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+entry:
+  call void @bar()
+  ret void
+}
+
+define internal void @bar() noinline nounwind readnone {
+; CHECK-NOT: bar
+entry:
+  ret void
+}
+
+

Added: llvm/trunk/test/Transforms/Reassociate/factorize-again.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/factorize-again.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/factorize-again.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/factorize-again.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -reassociate < %s | FileCheck %s
+
+define void @main(float, float) {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  wrapper_entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub float undef, [[TMP0:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub float undef, [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.rsqrt.f32(float undef)
+; CHECK-NEXT:    [[REASS_ADD2:%.*]] = fadd fast float [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[REASS_MUL3:%.*]] = fmul fast float [[TMP4]], [[REASS_ADD2]]
+; CHECK-NEXT:    [[REASS_ADD1:%.*]] = fadd fast float [[REASS_MUL3]], [[TMP4]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast float [[REASS_ADD1]], undef
+; CHECK-NEXT:    [[TMP5:%.*]] = call float @foo2(float [[REASS_MUL]], float 0.000000e+00)
+; CHECK-NEXT:    [[MUL36:%.*]] = fmul fast float [[TMP5]], 1.500000e+00
+; CHECK-NEXT:    call void @foo1(i32 4, float [[MUL36]])
+; CHECK-NEXT:    ret void
+;
+wrapper_entry:
+  %2 = fsub float undef, %0
+  %3 = fsub float undef, %1
+  %4 = call float @llvm.rsqrt.f32(float undef)
+  %5 = fmul fast float undef, %4
+  %6 = fmul fast float %2, %4
+  %7 = fmul fast float %3, %4
+  %8 = fmul fast float %5, undef
+  %9 = fmul fast float %6, undef
+  %10 = fmul fast float %7, undef
+  %11 = fadd fast float %8, %9
+  %12 = fadd fast float %11, %10
+  %13 = call float @foo2(float %12, float 0.000000e+00)
+  %mul36 = fmul fast float %13, 1.500000e+00
+  call void @foo1(i32 4, float %mul36)
+  ret void
+}
+
+declare void @foo1(i32, float)
+
+declare float @foo2(float, float) #1
+
+declare float @llvm.rsqrt.f32(float) #1
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readnone }
+

Added: llvm/trunk/test/Transforms/Reassociate/fast-AgressiveSubMove.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-AgressiveSubMove.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-AgressiveSubMove.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-AgressiveSubMove.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+define float @test1(float %A) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[X:%.*]] = fadd float %A, 1.000000e+00
+; CHECK-NEXT:    [[Y:%.*]] = fadd float %A, 1.000000e+00
+; CHECK-NEXT:    [[R:%.*]] = fsub float [[X]], [[Y]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %X = fadd float %A, 1.000000e+00
+  %Y = fadd float %A, 1.000000e+00
+  %r = fsub float %X, %Y
+  ret float %r
+}
+
+define float @test2(float %A) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %X = fadd fast float 1.000000e+00, %A
+  %Y = fadd fast float 1.000000e+00, %A
+  %r = fsub fast float %X, %Y
+  ret float %r
+}
+
+; Check again using minimal subset of FMF.
+
+define float @test2_reassoc(float %A) {
+; CHECK-LABEL: @test2_reassoc(
+; CHECK-NEXT:    [[X:%.*]] = fadd reassoc float %A, 1.000000e+00
+; CHECK-NEXT:    [[Y:%.*]] = fadd reassoc float %A, 1.000000e+00
+; CHECK-NEXT:    [[R:%.*]] = fsub reassoc float [[X]], [[Y]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %X = fadd reassoc float 1.000000e+00, %A
+  %Y = fadd reassoc float 1.000000e+00, %A
+  %r = fsub reassoc float %X, %Y
+  ret float %r
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+; Not marked as fast, so must not change.
+define float @test1(float %a0, float %a1, float %a2, float %a3, float %a4) {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %tmp.2 = fadd float %a3, %a4
+; CHECK-NEXT: %tmp.4 = fadd float %tmp.2, %a2
+; CHECK-NEXT: %tmp.6 = fadd float %tmp.4, %a1
+; CHECK-NEXT: %tmp.8 = fadd float %tmp.6, %a0
+; CHECK-NEXT: %tmp.11 = fadd float %a2, %a3
+; CHECK-NEXT: %tmp.13 = fadd float %tmp.11, %a1
+; CHECK-NEXT: %tmp.15 = fadd float %tmp.13, %a0
+; CHECK-NEXT: %tmp.18 = fadd float %a1, %a2
+; CHECK-NEXT: %tmp.20 = fadd float %tmp.18, %a0
+; CHECK-NEXT: %tmp.23 = fadd float %a0, %a1
+; CHECK-NEXT: %tmp.26 = fsub float %tmp.8, %tmp.15
+; CHECK-NEXT: %tmp.28 = fadd float %tmp.20, %tmp.26
+; CHECK-NEXT: %tmp.30 = fsub float %tmp.28, %tmp.23
+; CHECK-NEXT: %tmp.32 = fsub float %tmp.30, %a4
+; CHECK-NEXT: %tmp.34 = fsub float %tmp.32, %a2
+; CHECK-NEXT: %T = fmul float %tmp.34, %tmp.34
+; CHECK-NEXT: ret float %T
+
+  %tmp.2 = fadd float %a4, %a3
+  %tmp.4 = fadd float %tmp.2, %a2
+  %tmp.6 = fadd float %tmp.4, %a1
+  %tmp.8 = fadd float %tmp.6, %a0
+  %tmp.11 = fadd float %a3, %a2
+  %tmp.13 = fadd float %tmp.11, %a1
+  %tmp.15 = fadd float %tmp.13, %a0
+  %tmp.18 = fadd float %a2, %a1
+  %tmp.20 = fadd float %tmp.18, %a0
+  %tmp.23 = fadd float %a1, %a0
+  %tmp.26 = fsub float %tmp.8, %tmp.15
+  %tmp.28 = fadd float %tmp.26, %tmp.20
+  %tmp.30 = fsub float %tmp.28, %tmp.23
+  %tmp.32 = fsub float %tmp.30, %a4
+  %tmp.34 = fsub float %tmp.32, %a2
+  %T = fmul float %tmp.34, %tmp.34
+  ret float %T
+}
+
+; Should be able to eliminate everything.
+define float @test2(float %a0, float %a1, float %a2, float %a3, float %a4) {
+; CHECK-LABEL: test2
+; CHECK: ret float 0.000000e+00
+
+  %tmp.2 = fadd fast float %a4, %a3
+  %tmp.4 = fadd fast float %tmp.2, %a2
+  %tmp.6 = fadd fast float %tmp.4, %a1
+  %tmp.8 = fadd fast float %tmp.6, %a0
+  %tmp.11 = fadd fast float %a3, %a2
+  %tmp.13 = fadd fast float %tmp.11, %a1
+  %tmp.15 = fadd fast float %tmp.13, %a0
+  %tmp.18 = fadd fast float %a2, %a1
+  %tmp.20 = fadd fast float %tmp.18, %a0
+  %tmp.23 = fadd fast float %a1, %a0
+  %tmp.26 = fsub fast float %tmp.8, %tmp.15
+  %tmp.28 = fadd fast float %tmp.26, %tmp.20
+  %tmp.30 = fsub fast float %tmp.28, %tmp.23
+  %tmp.32 = fsub fast float %tmp.30, %a4
+  %tmp.34 = fsub fast float %tmp.32, %a2
+  %T = fmul fast float %tmp.34, %tmp.34
+  ret float %T
+}

Added: llvm/trunk/test/Transforms/Reassociate/fast-MissedTree.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-MissedTree.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-MissedTree.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-MissedTree.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+define float @test1(float %A, float %B) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[Z:%.*]] = fadd fast float %A, %B
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %W = fadd fast float %B, -5.0
+  %Y = fadd fast float %A, 5.0
+  %Z = fadd fast float %W, %Y
+  ret float %Z
+}
+
+; Check again using minimal subset of FMF.
+; Both 'reassoc' and 'nsz' are required.
+define float @test1_reassoc_nsz(float %A, float %B) {
+; CHECK-LABEL: @test1_reassoc_nsz(
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc nsz float %A, %B
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %W = fadd reassoc nsz float %B, -5.0
+  %Y = fadd reassoc nsz float %A, 5.0
+  %Z = fadd reassoc nsz float %W, %Y
+  ret float %Z
+}
+
+; Verify the fold is not done with only 'reassoc' ('nsz' is required).
+define float @test1_reassoc(float %A, float %B) {
+; CHECK-LABEL: @test1_reassoc(
+; CHECK-NEXT:    [[W:%.*]] = fadd reassoc float %B, -5.000000e+00
+; CHECK-NEXT:    [[Y:%.*]] = fadd reassoc float %A, 5.000000e+00
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc float [[Y]], [[W]]
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %W = fadd reassoc float %B, -5.0
+  %Y = fadd reassoc float %A, 5.0
+  %Z = fadd reassoc float %W, %Y
+  ret float %Z
+}

Added: llvm/trunk/test/Transforms/Reassociate/fast-ReassociateVector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-ReassociateVector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-ReassociateVector.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-ReassociateVector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; Check that a*c+b*c is turned into (a+b)*c
+
+define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = fadd fast <4 x float> [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast <4 x float> [[REASS_ADD]], [[C:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[REASS_MUL]]
+;
+  %mul = fmul fast <4 x float> %a, %c
+  %mul1 = fmul fast <4 x float> %b, %c
+  %add = fadd fast <4 x float> %mul, %mul1
+  ret <4 x float> %add
+}
+
+; Check that a*c+b*c is turned into (a+b)*c - minimum FMF subset version
+
+define <4 x float> @test1_reassoc(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test1_reassoc(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc <4 x float> [[A:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul reassoc <4 x float> [[B:%.*]], [[C]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd reassoc <4 x float> [[MUL]], [[MUL1]]
+; CHECK-NEXT:    ret <4 x float> [[ADD]]
+;
+  %mul = fmul reassoc <4 x float> %a, %c
+  %mul1 = fmul reassoc <4 x float> %b, %c
+  %add = fadd reassoc <4 x float> %mul, %mul1
+  ret <4 x float> %add
+}
+
+; Check that a*a*b+a*a*c is turned into a*(a*(b+c)).
+
+define <2 x float> @test2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[REASS_ADD1:%.*]] = fadd fast <2 x float> [[C:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[REASS_MUL2:%.*]] = fmul fast <2 x float> [[A:%.*]], [[A]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast <2 x float> [[REASS_MUL2]], [[REASS_ADD1]]
+; CHECK-NEXT:    ret <2 x float> [[REASS_MUL]]
+;
+  %t0 = fmul fast <2 x float> %a, %b
+  %t1 = fmul fast <2 x float> %a, %t0
+  %t2 = fmul fast <2 x float> %a, %c
+  %t3 = fmul fast <2 x float> %a, %t2
+  %t4 = fadd fast <2 x float> %t1, %t3
+  ret <2 x float> %t4
+}
+
+; Check that a*a*b+a*a*c is turned into a*(a*(b+c)) - minimum FMF subset version
+
+define <2 x float> @test2_reassoc(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: @test2_reassoc(
+; CHECK-NEXT:    [[T0:%.*]] = fmul reassoc <2 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = fmul reassoc <2 x float> [[A]], [[T0]]
+; CHECK-NEXT:    [[T2:%.*]] = fmul reassoc <2 x float> [[A]], [[C:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = fmul reassoc <2 x float> [[A]], [[T2]]
+; CHECK-NEXT:    [[T4:%.*]] = fadd reassoc <2 x float> [[T1]], [[T3]]
+; CHECK-NEXT:    ret <2 x float> [[T4]]
+;
+  %t0 = fmul reassoc <2 x float> %a, %b
+  %t1 = fmul reassoc <2 x float> %a, %t0
+  %t2 = fmul reassoc <2 x float> %a, %c
+  %t3 = fmul reassoc <2 x float> %a, %t2
+  %t4 = fadd reassoc <2 x float> %t1, %t3
+  ret <2 x float> %t4
+}
+
+; Check that a*b+a*c+d is turned into a*(b+c)+d.
+
+define <2 x double> @test3(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = fadd fast <2 x double> [[C:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast <2 x double> [[REASS_ADD]], [[A:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = fadd fast <2 x double> [[REASS_MUL]], [[D:%.*]]
+; CHECK-NEXT:    ret <2 x double> [[T3]]
+;
+  %t0 = fmul fast <2 x double> %a, %b
+  %t1 = fmul fast <2 x double> %a, %c
+  %t2 = fadd fast <2 x double> %t1, %d
+  %t3 = fadd fast <2 x double> %t0, %t2
+  ret <2 x double> %t3
+}
+
+; Check that a*b+a*c+d is turned into a*(b+c)+d - minimum FMF subset version
+
+define <2 x double> @test3_reassoc(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
+; CHECK-LABEL: @test3_reassoc(
+; CHECK-NEXT:    [[T0:%.*]] = fmul reassoc <2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[T1:%.*]] = fmul reassoc <2 x double> [[A]], [[C:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = fadd reassoc <2 x double> [[T1]], [[D:%.*]]
+; CHECK-NEXT:    [[T3:%.*]] = fadd reassoc <2 x double> [[T0]], [[T2]]
+; CHECK-NEXT:    ret <2 x double> [[T3]]
+;
+  %t0 = fmul reassoc <2 x double> %a, %b
+  %t1 = fmul reassoc <2 x double> %a, %c
+  %t2 = fadd reassoc <2 x double> %t1, %d
+  %t3 = fadd reassoc <2 x double> %t0, %t2
+  ret <2 x double> %t3
+}
+
+; No fast-math.
+
+define <2 x float> @test4(<2 x float> %A) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[X:%.*]] = fadd <2 x float> [[A:%.*]], <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[Y:%.*]] = fadd <2 x float> [[A]], <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[R:%.*]] = fsub <2 x float> [[X]], [[Y]]
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %X = fadd <2 x float> %A, < float 1.000000e+00, float 1.000000e+00 >
+  %Y = fadd <2 x float> %A, < float 1.000000e+00, float 1.000000e+00 >
+  %R = fsub <2 x float> %X, %Y
+  ret <2 x float> %R
+}
+
+; Check 47*X + 47*X -> 94*X.
+
+define <2 x float> @test5(<2 x float> %X) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast <2 x float> [[X:%.*]], <float 9.400000e+01, float 9.400000e+01>
+; CHECK-NEXT:    ret <2 x float> [[FACTOR]]
+;
+  %Y = fmul fast <2 x float> %X, <float 4.700000e+01, float 4.700000e+01>
+  %Z = fadd fast <2 x float> %Y, %Y
+  ret <2 x float> %Z
+}
+
+; Check 47*X + 47*X -> 94*X - minimum FMF subset version
+
+define <2 x float> @test5_reassoc(<2 x float> %X) {
+; CHECK-LABEL: @test5_reassoc(
+; CHECK-NEXT:    [[Y:%.*]] = fmul reassoc <2 x float> [[X:%.*]], <float 4.700000e+01, float 4.700000e+01>
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc <2 x float> [[Y]], [[Y]]
+; CHECK-NEXT:    ret <2 x float> [[Z]]
+;
+  %Y = fmul reassoc <2 x float> %X, <float 4.700000e+01, float 4.700000e+01>
+  %Z = fadd reassoc <2 x float> %Y, %Y
+  ret <2 x float> %Z
+}
+
+; Check X+X+X -> 3*X.
+
+define <2 x float> @test6(<2 x float> %X) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast <2 x float> [[X:%.*]], <float 3.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[FACTOR]]
+;
+  %Y = fadd fast <2 x float> %X ,%X
+  %Z = fadd fast <2 x float> %Y, %X
+  ret <2 x float> %Z
+}
+
+; Check X+X+X -> 3*X - minimum FMF subset version
+
+define <2 x float> @test6_reassoc(<2 x float> %X) {
+; CHECK-LABEL: @test6_reassoc(
+; CHECK-NEXT:    [[Y:%.*]] = fadd reassoc <2 x float> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc <2 x float> [[X]], [[Y]]
+; CHECK-NEXT:    ret <2 x float> [[Z]]
+;
+  %Y = fadd reassoc <2 x float> %X ,%X
+  %Z = fadd reassoc <2 x float> %Y, %X
+  ret <2 x float> %Z
+}
+
+; Check 127*W+50*W -> 177*W.
+
+define <2 x double> @test7(<2 x double> %W) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast <2 x double> [[W:%.*]], <double 1.770000e+02, double 1.770000e+02>
+; CHECK-NEXT:    ret <2 x double> [[REASS_MUL]]
+;
+  %X = fmul fast <2 x double> %W, <double 127.0, double 127.0>
+  %Y = fmul fast <2 x double> %W, <double 50.0, double 50.0>
+  %Z = fadd fast <2 x double> %Y, %X
+  ret <2 x double> %Z
+}
+
+; Check 127*W+50*W -> 177*W - minimum FMF subset version
+
+define <2 x double> @test7_reassoc(<2 x double> %W) {
+; CHECK-LABEL: @test7_reassoc(
+; CHECK-NEXT:    [[X:%.*]] = fmul reassoc <2 x double> [[W:%.*]], <double 1.270000e+02, double 1.270000e+02>
+; CHECK-NEXT:    [[Y:%.*]] = fmul reassoc <2 x double> [[W]], <double 5.000000e+01, double 5.000000e+01>
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc <2 x double> [[Y]], [[X]]
+; CHECK-NEXT:    ret <2 x double> [[Z]]
+;
+  %X = fmul reassoc <2 x double> %W, <double 127.0, double 127.0>
+  %Y = fmul reassoc <2 x double> %W, <double 50.0, double 50.0>
+  %Z = fadd reassoc <2 x double> %Y, %X
+  ret <2 x double> %Z
+}
+
+; Check X*12*12 -> X*144.
+
+define <2 x float> @test8(<2 x float> %arg) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[ARG:%.*]], <float 1.440000e+02, float 1.440000e+02>
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
+;
+  %tmp1 = fmul fast <2 x float> <float 1.200000e+01, float 1.200000e+01>, %arg
+  %tmp2 = fmul fast <2 x float> %tmp1, <float 1.200000e+01, float 1.200000e+01>
+  ret <2 x float> %tmp2
+}
+
+; Check X*12*12 -> X*144 - minimum FMF subset version
+
+define <2 x float> @test8_reassoc(<2 x float> %arg) {
+; CHECK-LABEL: @test8_reassoc(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc <2 x float> [[ARG:%.*]], <float 1.200000e+01, float 1.200000e+01>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul reassoc <2 x float> [[TMP1]], <float 1.200000e+01, float 1.200000e+01>
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
+;
+  %tmp1 = fmul reassoc <2 x float> <float 1.200000e+01, float 1.200000e+01>, %arg
+  %tmp2 = fmul reassoc <2 x float> %tmp1, <float 1.200000e+01, float 1.200000e+01>
+  ret <2 x float> %tmp2
+}
+
+; Check (b+(a+1234))+-a -> b+1234.
+
+define <2 x double> @test9(<2 x double> %b, <2 x double> %a) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast <2 x double> zeroinitializer, [[A:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[B:%.*]], <double 1.234000e+03, double 1.234000e+03>
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = fadd fast <2 x double> %a, <double 1.234000e+03, double 1.234000e+03>
+  %2 = fadd fast <2 x double> %b, %1
+  %3 = fsub fast <2 x double> <double 0.000000e+00, double 0.000000e+00>, %a
+  %4 = fadd fast <2 x double> %2, %3
+  ret <2 x double> %4
+}
+
+; Check (b+(a+1234))+-a -> b+1234 - minimum FMF subset version
+
+define <2 x double> @test9_reassoc(<2 x double> %b, <2 x double> %a) {
+; CHECK-LABEL: @test9_reassoc(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd reassoc <2 x double> [[A:%.*]], <double 1.234000e+03, double 1.234000e+03>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd reassoc <2 x double> [[B:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub reassoc <2 x double> zeroinitializer, [[A]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd reassoc <2 x double> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = fadd reassoc <2 x double> %a, <double 1.234000e+03, double 1.234000e+03>
+  %2 = fadd reassoc <2 x double> %b, %1
+  %3 = fsub reassoc <2 x double> <double 0.000000e+00, double 0.000000e+00>, %a
+  %4 = fadd reassoc <2 x double> %2, %3
+  ret <2 x double> %4
+}
+
+; Check -(-(z*40)*a) -> a*40*z.
+
+define <2 x float> @test10(<2 x float> %a, <2 x float> %b, <2 x float> %z) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast <2 x float> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[E:%.*]] = fmul fast <2 x float> [[A:%.*]], <float 4.000000e+01, float 4.000000e+01>
+; CHECK-NEXT:    [[F:%.*]] = fmul fast <2 x float> [[E]], [[Z:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[F]]
+;
+  %d = fmul fast <2 x float> %z, <float 4.000000e+01, float 4.000000e+01>
+  %c = fsub fast <2 x float> <float 0.000000e+00, float 0.000000e+00>, %d
+  %e = fmul fast <2 x float> %a, %c
+  %f = fsub fast <2 x float> <float 0.000000e+00, float 0.000000e+00>, %e
+  ret <2 x float> %f
+}
+
+; Check -(-(z*40)*a) -> a*40*z - minimum FMF subset version
+
+define <2 x float> @test10_reassoc(<2 x float> %a, <2 x float> %b, <2 x float> %z) {
+; CHECK-LABEL: @test10_reassoc(
+; CHECK-NEXT:    [[D:%.*]] = fmul reassoc <2 x float> [[Z:%.*]], <float 4.000000e+01, float 4.000000e+01>
+; CHECK-NEXT:    [[C:%.*]] = fsub reassoc <2 x float> zeroinitializer, [[D]]
+; CHECK-NEXT:    [[E:%.*]] = fmul reassoc <2 x float> [[A:%.*]], [[C]]
+; CHECK-NEXT:    [[F:%.*]] = fsub reassoc <2 x float> zeroinitializer, [[E]]
+; CHECK-NEXT:    ret <2 x float> [[F]]
+;
+  %d = fmul reassoc <2 x float> %z, <float 4.000000e+01, float 4.000000e+01>
+  %c = fsub reassoc <2 x float> <float 0.000000e+00, float 0.000000e+00>, %d
+  %e = fmul reassoc <2 x float> %a, %c
+  %f = fsub reassoc <2 x float> <float 0.000000e+00, float 0.000000e+00>, %e
+  ret <2 x float> %f
+}
+
+; Check x*y+y*x -> x*y*2.
+
+define <2 x double> @test11(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast <2 x double> [[FACTOR]], <double 2.000000e+00, double 2.000000e+00>
+; CHECK-NEXT:    ret <2 x double> [[REASS_MUL]]
+;
+  %1 = fmul fast <2 x double> %x, %y
+  %2 = fmul fast <2 x double> %y, %x
+  %3 = fadd fast <2 x double> %1, %2
+  ret <2 x double> %3
+}
+
+; Check x*y+y*x -> x*y*2 - minimum FMF subset version
+
+define <2 x double> @test11_reassoc(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @test11_reassoc(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul reassoc <2 x double> [[X]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd reassoc <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %1 = fmul reassoc <2 x double> %x, %y
+  %2 = fmul reassoc <2 x double> %y, %x
+  %3 = fadd reassoc <2 x double> %1, %2
+  ret <2 x double> %3
+}
+
+; FIXME: shifts should be converted to mul to assist further reassociation.
+
+define <2 x i64> @test12(<2 x i64> %b, <2 x i64> %c) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i64> [[C:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl <2 x i64> [[MUL]], <i64 5, i64 5>
+; CHECK-NEXT:    ret <2 x i64> [[SHL]]
+;
+  %mul = mul <2 x i64> %c, %b
+  %shl = shl <2 x i64> %mul, <i64 5, i64 5>
+  ret <2 x i64> %shl
+}
+
+; FIXME: expressions with a negative const should be canonicalized to assist
+; further reassociation.
+; We would expect (-5*b)+a -> a-(5*b) but only the constant operand is commuted.
+
+define <4 x float> @test13(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast <4 x float> [[B:%.*]], <float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00>
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast <4 x float> [[MUL]], [[A:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[ADD]]
+;
+  %mul = fmul fast <4 x float> <float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00>, %b
+  %add = fadd fast <4 x float> %mul, %a
+  ret <4 x float> %add
+}
+
+; Break up subtract to assist further reassociation.
+; Check a+b-c -> a+b+-c.
+
+define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
+; CHECK-LABEL: @test14(
+; CHECK-NEXT:    [[ADD:%.*]] = add <2 x i64> [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[C_NEG:%.*]] = sub <2 x i64> zeroinitializer, [[C:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i64> [[ADD]], [[C_NEG]]
+; CHECK-NEXT:    ret <2 x i64> [[SUB]]
+;
+  %add = add <2 x i64> %b, %a
+  %sub = sub <2 x i64> %add, %c
+  ret <2 x i64> %sub
+}
+
+define <2 x i32> @test15(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test15(
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
+;
+  %tmp1 = and <2 x i32> %x, %y
+  %tmp2 = and <2 x i32> %y, %x
+  %tmp3 = and <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test16(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test16(
+; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
+;
+  %tmp1 = or <2 x i32> %x, %y
+  %tmp2 = or <2 x i32> %y, %x
+  %tmp3 = or <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test17(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test17(
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+  %tmp1 = xor <2 x i32> %x, %y
+  %tmp2 = xor <2 x i32> %y, %x
+  %tmp3 = xor <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test18(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test18(
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP5]]
+;
+  %tmp1 = xor <2 x i32> %x, %y
+  %tmp2 = xor <2 x i32> %y, %x
+  %tmp3 = xor <2 x i32> %x, %y
+  %tmp4 = xor <2 x i32> %tmp1, %tmp2
+  %tmp5 = xor <2 x i32> %tmp4, %tmp3
+  ret <2 x i32> %tmp5
+}

Added: llvm/trunk/test/Transforms/Reassociate/fast-SubReassociate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-SubReassociate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-SubReassociate.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-SubReassociate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,119 @@
+; RUN: opt < %s -reassociate -constprop -instcombine -S | FileCheck %s
+
+define float @test1(float %A, float %B) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[W:%.*]] = fadd float %B, 5.000000e+00
+; CHECK-NEXT:    [[X:%.*]] = fadd float %A, -7.000000e+00
+; CHECK-NEXT:    [[Y:%.*]] = fsub float [[X]], [[W]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd float [[Y]], 1.200000e+01
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %W = fadd float 5.0, %B
+  %X = fadd float -7.0, %A
+  %Y = fsub float %X, %W
+  %Z = fadd float %Y, 12.0
+  ret float %Z
+}
+
+; With sub reassociation, constant folding can eliminate all of the constants.
+define float @test2(float %A, float %B) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[Z:%.*]] = fsub fast float %A, %B
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %W = fadd fast float %B, 5.000000e+00
+  %X = fadd fast float %A, -7.000000e+00
+  %Y = fsub fast float %X, %W
+  %Z = fadd fast float %Y, 1.200000e+01
+  ret float %Z
+}
+
+; Check again using minimal subset of FMF.
+; Both 'reassoc' and 'nsz' are required.
+define float @test2_minimal(float %A, float %B) {
+; CHECK-LABEL: @test2_minimal(
+; CHECK-NEXT:    [[Z:%.*]] = fsub reassoc nsz float %A, %B
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %W = fadd reassoc nsz float %B, 5.000000e+00
+  %X = fadd reassoc nsz float %A, -7.000000e+00
+  %Y = fsub reassoc nsz float %X, %W
+  %Z = fadd reassoc nsz float %Y, 1.200000e+01
+  ret float %Z
+}
+
+; Verify the fold is not done with only 'reassoc' ('nsz' is required).
+define float @test2_reassoc(float %A, float %B) {
+; CHECK-LABEL: @test2_reassoc(
+; CHECK-NEXT:    [[W:%.*]] = fadd reassoc float %B, 5.000000e+00
+; CHECK-NEXT:    [[X:%.*]] = fadd reassoc float %A, -7.000000e+00
+; CHECK-NEXT:    [[Y:%.*]] = fsub reassoc float [[X]], [[W]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc float [[Y]], 1.200000e+01
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %W = fadd reassoc float %B, 5.000000e+00
+  %X = fadd reassoc float %A, -7.000000e+00
+  %Y = fsub reassoc float %X, %W
+  %Z = fadd reassoc float %Y, 1.200000e+01
+  ret float %Z
+}
+
+define float @test3(float %A, float %B, float %C, float %D) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[M:%.*]] = fadd float %A, 1.200000e+01
+; CHECK-NEXT:    [[N:%.*]] = fadd float [[M]], %B
+; CHECK-NEXT:    [[O:%.*]] = fadd float [[N]], %C
+; CHECK-NEXT:    [[P:%.*]] = fsub float %D, [[O]]
+; CHECK-NEXT:    [[Q:%.*]] = fadd float [[P]], 1.200000e+01
+; CHECK-NEXT:    ret float [[Q]]
+;
+  %M = fadd float %A, 1.200000e+01
+  %N = fadd float %M, %B
+  %O = fadd float %N, %C
+  %P = fsub float %D, %O
+  %Q = fadd float %P, 1.200000e+01
+  ret float %Q
+}
+
+; With sub reassociation, constant folding can eliminate the two 12 constants.
+
+define float @test4(float %A, float %B, float %C, float %D) {
+; FIXME: InstCombine should be able to get us to the following:
+; %sum = fadd fast float %B, %A
+; %sum1 = fadd fast float %sum, %C
+; %Q = fsub fast float %D, %sum1
+; ret i32 %Q
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[B_NEG:%.*]] = fsub fast float -0.000000e+00, %B
+; CHECK-NEXT:    [[O_NEG:%.*]] = fsub fast float [[B_NEG]], %A
+; CHECK-NEXT:    [[P:%.*]] = fsub fast float [[O_NEG]], %C
+; CHECK-NEXT:    [[Q:%.*]] = fadd fast float [[P]], %D
+; CHECK-NEXT:    ret float [[Q]]
+;
+  %M = fadd fast float 1.200000e+01, %A
+  %N = fadd fast float %M, %B
+  %O = fadd fast float %N, %C
+  %P = fsub fast float %D, %O
+  %Q = fadd fast float 1.200000e+01, %P
+  ret float %Q
+}
+
+; Check again using minimal subset of FMF.
+
+define float @test4_reassoc(float %A, float %B, float %C, float %D) {
+; CHECK-LABEL: @test4_reassoc(
+; CHECK-NEXT:    [[M:%.*]] = fadd reassoc float %A, 1.200000e+01
+; CHECK-NEXT:    [[N:%.*]] = fadd reassoc float [[M]], %B
+; CHECK-NEXT:    [[O:%.*]] = fadd reassoc float [[N]], %C
+; CHECK-NEXT:    [[P:%.*]] = fsub reassoc float %D, [[O]]
+; CHECK-NEXT:    [[Q:%.*]] = fadd reassoc float [[P]], 1.200000e+01
+; CHECK-NEXT:    ret float [[Q]]
+;
+  %M = fadd reassoc float 1.200000e+01, %A
+  %N = fadd reassoc float %M, %B
+  %O = fadd reassoc float %N, %C
+  %P = fsub reassoc float %D, %O
+  %Q = fadd reassoc float 1.200000e+01, %P
+  ret float %Q
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/fast-basictest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-basictest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-basictest.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-basictest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,606 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -gvn -instcombine -S | FileCheck %s
+
+; With reassociation, constant folding can eliminate the 12 and -12 constants.
+define float @test1(float %arg) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[ARG_NEG:%.*]] = fsub fast float -0.000000e+00, [[ARG:%.*]]
+; CHECK-NEXT:    ret float [[ARG_NEG]]
+;
+  %t1 = fsub fast float -1.200000e+01, %arg
+  %t2 = fadd fast float %t1, 1.200000e+01
+  ret float %t2
+}
+
+; Check again using the minimal subset of FMF.
+; Both 'reassoc' and 'nsz' are required.
+define float @test1_minimal(float %arg) {
+; CHECK-LABEL: @test1_minimal(
+; CHECK-NEXT:    [[ARG_NEG:%.*]] = fsub reassoc nsz float -0.000000e+00, [[ARG:%.*]]
+; CHECK-NEXT:    ret float [[ARG_NEG]]
+;
+  %t1 = fsub reassoc nsz float -1.200000e+01, %arg
+  %t2 = fadd reassoc nsz float %t1, 1.200000e+01
+  ret float %t2
+}
+
+; Verify the fold is not done with only 'reassoc' ('nsz' is required).
+define float @test1_reassoc(float %arg) {
+; CHECK-LABEL: @test1_reassoc(
+; CHECK-NEXT:    [[T1:%.*]] = fsub reassoc float -1.200000e+01, [[ARG:%.*]]
+; CHECK-NEXT:    [[T2:%.*]] = fadd reassoc float [[T1]], 1.200000e+01
+; CHECK-NEXT:    ret float [[T2]]
+;
+  %t1 = fsub reassoc float -1.200000e+01, %arg
+  %t2 = fadd reassoc float %t1, 1.200000e+01
+  ret float %t2
+}
+
+define float @test2(float %reg109, float %reg1111) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[REG115:%.*]] = fadd float [[REG109:%.*]], -3.000000e+01
+; CHECK-NEXT:    [[REG116:%.*]] = fadd float [[REG115]], [[REG1111:%.*]]
+; CHECK-NEXT:    [[REG117:%.*]] = fadd float [[REG116]], 3.000000e+01
+; CHECK-NEXT:    ret float [[REG117]]
+;
+  %reg115 = fadd float %reg109, -3.000000e+01
+  %reg116 = fadd float %reg115, %reg1111
+  %reg117 = fadd float %reg116, 3.000000e+01
+  ret float %reg117
+}
+
+define float @test3(float %reg109, float %reg1111) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[REG117:%.*]] = fadd fast float [[REG109:%.*]], [[REG1111:%.*]]
+; CHECK-NEXT:    ret float [[REG117]]
+;
+  %reg115 = fadd fast float %reg109, -3.000000e+01
+  %reg116 = fadd fast float %reg115, %reg1111
+  %reg117 = fadd fast float %reg116, 3.000000e+01
+  ret float %reg117
+}
+
+define float @test3_reassoc(float %reg109, float %reg1111) {
+; CHECK-LABEL: @test3_reassoc(
+; CHECK-NEXT:    [[REG115:%.*]] = fadd reassoc float [[REG109:%.*]], -3.000000e+01
+; CHECK-NEXT:    [[REG116:%.*]] = fadd reassoc float [[REG115]], [[REG1111:%.*]]
+; CHECK-NEXT:    [[REG117:%.*]] = fadd reassoc float [[REG116]], 3.000000e+01
+; CHECK-NEXT:    ret float [[REG117]]
+;
+  %reg115 = fadd reassoc float %reg109, -3.000000e+01
+  %reg116 = fadd reassoc float %reg115, %reg1111
+  %reg117 = fadd reassoc float %reg116, 3.000000e+01
+  ret float %reg117
+}
+
+ at fe = external global float
+ at fa = external global float
+ at fb = external global float
+ at fc = external global float
+ at ff = external global float
+
+define void @test4() {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[A:%.*]] = load float, float* @fa, align 4
+; CHECK-NEXT:    [[B:%.*]] = load float, float* @fb, align 4
+; CHECK-NEXT:    [[C:%.*]] = load float, float* @fc, align 4
+; CHECK-NEXT:    [[T1:%.*]] = fadd fast float [[B]], [[A]]
+; CHECK-NEXT:    [[T2:%.*]] = fadd fast float [[T1]], [[C]]
+; CHECK-NEXT:    store float [[T2]], float* @fe, align 4
+; CHECK-NEXT:    store float [[T2]], float* @ff, align 4
+; CHECK-NEXT:    ret void
+;
+  %A = load float, float* @fa
+  %B = load float, float* @fb
+  %C = load float, float* @fc
+  %t1 = fadd fast float %A, %B
+  %t2 = fadd fast float %t1, %C
+  %t3 = fadd fast float %C, %A
+  %t4 = fadd fast float %t3, %B
+  ; e = (a+b)+c;
+  store float %t2, float* @fe
+  ; f = (a+c)+b
+  store float %t4, float* @ff
+  ret void
+}
+
+define void @test5() {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[A:%.*]] = load float, float* @fa, align 4
+; CHECK-NEXT:    [[B:%.*]] = load float, float* @fb, align 4
+; CHECK-NEXT:    [[C:%.*]] = load float, float* @fc, align 4
+; CHECK-NEXT:    [[T1:%.*]] = fadd fast float [[B]], [[A]]
+; CHECK-NEXT:    [[T2:%.*]] = fadd fast float [[T1]], [[C]]
+; CHECK-NEXT:    store float [[T2]], float* @fe, align 4
+; CHECK-NEXT:    store float [[T2]], float* @ff, align 4
+; CHECK-NEXT:    ret void
+;
+  %A = load float, float* @fa
+  %B = load float, float* @fb
+  %C = load float, float* @fc
+  %t1 = fadd fast float %A, %B
+  %t2 = fadd fast float %t1, %C
+  %t3 = fadd fast float %C, %A
+  %t4 = fadd fast float %t3, %B
+  ; e = c+(a+b)
+  store float %t2, float* @fe
+  ; f = (c+a)+b
+  store float %t4, float* @ff
+  ret void
+}
+
+define void @test6() {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[A:%.*]] = load float, float* @fa, align 4
+; CHECK-NEXT:    [[B:%.*]] = load float, float* @fb, align 4
+; CHECK-NEXT:    [[C:%.*]] = load float, float* @fc, align 4
+; CHECK-NEXT:    [[T1:%.*]] = fadd fast float [[B]], [[A]]
+; CHECK-NEXT:    [[T2:%.*]] = fadd fast float [[T1]], [[C]]
+; CHECK-NEXT:    store float [[T2]], float* @fe, align 4
+; CHECK-NEXT:    store float [[T2]], float* @ff, align 4
+; CHECK-NEXT:    ret void
+;
+  %A = load float, float* @fa
+  %B = load float, float* @fb
+  %C = load float, float* @fc
+  %t1 = fadd fast float %B, %A
+  %t2 = fadd fast float %t1, %C
+  %t3 = fadd fast float %C, %A
+  %t4 = fadd fast float %t3, %B
+  ; e = c+(b+a)
+  store float %t2, float* @fe
+  ; f = (c+a)+b
+  store float %t4, float* @ff
+  ret void
+}
+
+define float @test7(float %A, float %B, float %C) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[REASS_ADD1:%.*]] = fadd fast float [[C:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[REASS_MUL2:%.*]] = fmul fast float [[A:%.*]], [[A]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast float [[REASS_MUL2]], [[REASS_ADD1]]
+; CHECK-NEXT:    ret float [[REASS_MUL]]
+;
+  %aa = fmul fast float %A, %A   ; a*a
+  %aab = fmul fast float %aa, %B ; a*a*b
+  %ac = fmul fast float %A, %C
+  %aac = fmul fast float %ac, %A ; a*a*c
+  %r = fadd fast float %aab, %aac ; a*a*b + a*a*c -> a*a*(b+c) with 'fast'
+  ret float %r
+}
+
+define float @test7_reassoc(float %A, float %B, float %C) {
+; CHECK-LABEL: @test7_reassoc(
+; CHECK-NEXT:    [[AA:%.*]] = fmul reassoc float [[A:%.*]], [[A]]
+; CHECK-NEXT:    [[AAB:%.*]] = fmul reassoc float [[AA]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[A]], [[A]]
+; CHECK-NEXT:    [[AAC:%.*]] = fmul reassoc float [[TMP1]], [[C:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = fadd reassoc float [[AAB]], [[AAC]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %aa = fmul reassoc float %A, %A
+  %aab = fmul reassoc float %aa, %B
+  %ac = fmul reassoc float %A, %C
+  %aac = fmul reassoc float %ac, %A
+  %r = fadd reassoc float %aab, %aac ; with only 'reassoc' the add is not factored (per CHECKs above)
+  ret float %r
+}
+
+; (-X)*Y + Z -> Z-X*Y
+
+define float @test8(float %X, float %Y, float %Z) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[A:%.*]] = fmul fast float [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = fsub fast float [[Z:%.*]], [[A]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fsub fast float 0.0, %X   ; -X
+  %B = fmul fast float %A, %Y    ; (-X)*Y
+  %C = fadd fast float %B, %Z    ; (-X)*Y + Z -> Z - X*Y with 'fast'
+  ret float %C
+}
+
+define float @test8_reassoc(float %X, float %Y, float %Z) {
+; CHECK-LABEL: @test8_reassoc(
+; CHECK-NEXT:    [[A:%.*]] = fsub reassoc float 0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fmul reassoc float [[A]], [[Y:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = fadd reassoc float [[B]], [[Z:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fsub reassoc float 0.0, %X
+  %B = fmul reassoc float %A, %Y
+  %C = fadd reassoc float %B, %Z ; 'reassoc' alone: expression kept as-is (per CHECKs above)
+  ret float %C
+}
+
+define float @test9(float %X) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast float [[X:%.*]], 9.400000e+01
+; CHECK-NEXT:    ret float [[FACTOR]]
+;
+  %Y = fmul fast float %X, 4.700000e+01 ; x*47
+  %Z = fadd fast float %Y, %Y           ; x*47 + x*47 -> x*94
+  ret float %Z
+}
+
+; Check again with 'reassoc' and 'nsz' ('nsz' not technically required).
+define float @test9_reassoc_nsz(float %X) {
+; CHECK-LABEL: @test9_reassoc_nsz(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul reassoc nsz float [[X:%.*]], 9.400000e+01
+; CHECK-NEXT:    ret float [[FACTOR]]
+;
+  %Y = fmul reassoc nsz float %X, 4.700000e+01
+  %Z = fadd reassoc nsz float %Y, %Y ; folds to x*94 with reassoc+nsz
+  ret float %Z
+}
+
+; TODO: This doesn't require 'nsz'.  It should fold to X * 94.0
+define float @test9_reassoc(float %X) {
+; CHECK-LABEL: @test9_reassoc(
+; CHECK-NEXT:    [[Y:%.*]] = fmul reassoc float [[X:%.*]], 4.700000e+01
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc float [[Y]], [[Y]]
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %Y = fmul reassoc float %X, 4.700000e+01
+  %Z = fadd reassoc float %Y, %Y ; not folded without 'nsz' today (see TODO above)
+  ret float %Z
+}
+
+; Side note: (x + x + x) and (3*x) each have only a single rounding.  So
+; transforming x+x+x to 3*x is always safe, even without any FMF.
+; To avoid that special-case, we have the addition of 'x' four times, here.
+define float @test10(float %X) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast float [[X:%.*]], 4.000000e+00
+; CHECK-NEXT:    ret float [[FACTOR]]
+;
+  %Y = fadd fast float %X ,%X  ; 2x
+  %Z = fadd fast float %Y, %X  ; 3x
+  %W = fadd fast float %Z, %X  ; 4x -> x * 4.0
+  ret float %W
+}
+
+; Check again with 'reassoc' and 'nsz' ('nsz' not technically required).
+define float @test10_reassoc_nsz(float %X) {
+; CHECK-LABEL: @test10_reassoc_nsz(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul reassoc nsz float [[X:%.*]], 4.000000e+00
+; CHECK-NEXT:    ret float [[FACTOR]]
+;
+  %Y = fadd reassoc nsz float %X ,%X
+  %Z = fadd reassoc nsz float %Y, %X
+  %W = fadd reassoc nsz float %Z, %X ; folds to x*4.0 with reassoc+nsz
+  ret float %W
+}
+
+; TODO: This doesn't require 'nsz'.  It should fold to 4 * x
+define float @test10_reassoc(float %X) {
+; CHECK-LABEL: @test10_reassoc(
+; CHECK-NEXT:    [[Y:%.*]] = fadd reassoc float [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc float [[Y]], [[X]]
+; CHECK-NEXT:    [[W:%.*]] = fadd reassoc float [[Z]], [[X]]
+; CHECK-NEXT:    ret float [[W]]
+;
+  %Y = fadd reassoc float %X ,%X
+  %Z = fadd reassoc float %Y, %X
+  %W = fadd reassoc float %Z, %X ; chain kept without 'nsz' (see TODO above)
+  ret float %W
+}
+
+define float @test11(float %W) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast float [[W:%.*]], 3.810000e+02
+; CHECK-NEXT:    ret float [[FACTOR]]
+;
+  %X = fmul fast float %W, 127.0 ; w*127
+  %Y = fadd fast float %X ,%X    ; 2*(w*127)
+  %Z = fadd fast float %Y, %X    ; 3*(w*127) -> w*381
+  ret float %Z
+}
+
+; Check again using the minimal subset of FMF.
+; Check again with 'reassoc' and 'nsz' ('nsz' not technically required).
+define float @test11_reassoc_nsz(float %W) {
+; CHECK-LABEL: @test11_reassoc_nsz(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul reassoc nsz float [[W:%.*]], 3.810000e+02
+; CHECK-NEXT:    ret float [[FACTOR]]
+;
+  %X = fmul reassoc nsz float %W, 127.0
+  %Y = fadd reassoc nsz float %X ,%X
+  %Z = fadd reassoc nsz float %Y, %X ; folds to w*381 with reassoc+nsz
+  ret float %Z
+}
+
+; TODO: This doesn't require 'nsz'.  It should fold to W*381.0.
+define float @test11_reassoc(float %W) {
+; CHECK-LABEL: @test11_reassoc(
+; CHECK-NEXT:    [[X:%.*]] = fmul reassoc float [[W:%.*]], 1.270000e+02
+; CHECK-NEXT:    [[Y:%.*]] = fadd reassoc float [[X]], [[X]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc float [[X]], [[Y]]
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %X = fmul reassoc float %W, 127.0
+  %Y = fadd reassoc float %X ,%X
+  %Z = fadd reassoc float %Y, %X ; adds kept (only commuted) without 'nsz' (see TODO above)
+  ret float %Z
+}
+
+define float @test12(float %X) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast float [[X:%.*]], -3.000000e+00
+; CHECK-NEXT:    [[Z:%.*]] = fadd fast float [[FACTOR]], 6.000000e+00
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %A = fsub fast float 1.000000e+00, %X ; 1-x
+  %B = fsub fast float 2.000000e+00, %X ; 2-x
+  %C = fsub fast float 3.000000e+00, %X ; 3-x
+  %Y = fadd fast float %A ,%B
+  %Z = fadd fast float %Y, %C           ; (1-x)+(2-x)+(3-x) -> 6 - 3x
+  ret float %Z
+}
+
+; Check again with 'reassoc' and 'nsz' ('nsz' not technically required).
+define float @test12_reassoc_nsz(float %X) {
+; CHECK-LABEL: @test12_reassoc_nsz(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul reassoc nsz float [[X:%.*]], 3.000000e+00
+; CHECK-NEXT:    [[Z:%.*]] = fsub reassoc nsz float 6.000000e+00, [[FACTOR]]
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %A = fsub reassoc nsz float 1.000000e+00, %X
+  %B = fsub reassoc nsz float 2.000000e+00, %X
+  %C = fsub reassoc nsz float 3.000000e+00, %X
+  %Y = fadd reassoc nsz float %A ,%B
+  %Z = fadd reassoc nsz float %Y, %C ; folds to 6 - 3x with reassoc+nsz
+  ret float %Z
+}
+
+; TODO: This doesn't require 'nsz'.  It should fold to (6.0 - 3.0*x)
+define float @test12_reassoc(float %X) {
+; CHECK-LABEL: @test12_reassoc(
+; CHECK-NEXT:    [[A:%.*]] = fsub reassoc float 1.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fsub reassoc float 2.000000e+00, [[X]]
+; CHECK-NEXT:    [[C:%.*]] = fsub reassoc float 3.000000e+00, [[X]]
+; CHECK-NEXT:    [[Y:%.*]] = fadd reassoc float [[A]], [[B]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc float [[C]], [[Y]]
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %A = fsub reassoc float 1.000000e+00, %X
+  %B = fsub reassoc float 2.000000e+00, %X
+  %C = fsub reassoc float 3.000000e+00, %X
+  %Y = fadd reassoc float %A ,%B
+  %Z = fadd reassoc float %Y, %C ; not constant-folded without 'nsz' (see TODO above)
+  ret float %Z
+}
+
+define float @test13(float %X1, float %X2, float %X3) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = fsub fast float [[X3:%.*]], [[X2:%.*]]
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast float [[REASS_ADD]], [[X1:%.*]]
+; CHECK-NEXT:    ret float [[REASS_MUL]]
+;
+  %A = fsub fast float 0.000000e+00, %X1 ; -X1
+  %B = fmul fast float %A, %X2   ; -X1*X2
+  %C = fmul fast float %X1, %X3  ; X1*X3
+  %D = fadd fast float %B, %C    ; -X1*X2 + X1*X3 -> X1*(X3-X2)
+  ret float %D
+}
+
+define float @test13_reassoc(float %X1, float %X2, float %X3) {
+; CHECK-LABEL: @test13_reassoc(
+; CHECK-NEXT:    [[A:%.*]] = fsub reassoc float 0.000000e+00, [[X1:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fmul reassoc float [[A]], [[X2:%.*]]
+; CHECK-NEXT:    [[C:%.*]] = fmul reassoc float [[X1]], [[X3:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fadd reassoc float [[B]], [[C]]
+; CHECK-NEXT:    ret float [[D]]
+;
+  %A = fsub reassoc float 0.000000e+00, %X1
+  %B = fmul reassoc float %A, %X2   ; -X1*X2
+  %C = fmul reassoc float %X1, %X3  ; X1*X3
+  %D = fadd reassoc float %B, %C    ; -X1*X2 + X1*X3 -> X1*(X3-X2); not factored with 'reassoc' alone
+  ret float %D
+}
+
+define float @test14(float %X1, float %X2) {
+; CHECK-LABEL: @test14(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast float [[X1:%.*]], [[X2:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[TMP1]], 4.700000e+01
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %B = fmul fast float %X1, 47.   ; X1*47
+  %C = fmul fast float %X2, -47.  ; X2*-47
+  %D = fadd fast float %B, %C    ; X1*47 + X2*-47 -> 47*(X1-X2)
+  ret float %D
+}
+
+; (x1 * 47) + (x2 * -47) => (x1 - x2) * 47
+; Check again with 'reassoc' and 'nsz' ('nsz' not technically required).
+define float @test14_reassoc_nsz(float %X1, float %X2) {
+; CHECK-LABEL: @test14_reassoc_nsz(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub reassoc nsz float [[X1:%.*]], [[X2:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul reassoc nsz float [[TMP1]], 4.700000e+01
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %B = fmul reassoc nsz float %X1, 47.   ; X1*47
+  %C = fmul reassoc nsz float %X2, -47.  ; X2*-47
+  %D = fadd reassoc nsz float %B, %C    ; X1*47 + X2*-47 -> 47*(X1-X2)
+  ret float %D
+}
+
+; TODO: This doesn't require 'nsz'.  It should fold to ((x1 - x2) * 47.0)
+define float @test14_reassoc(float %X1, float %X2) {
+; CHECK-LABEL: @test14_reassoc(
+; CHECK-NEXT:    [[B:%.*]] = fmul reassoc float [[X1:%.*]], 4.700000e+01
+; CHECK-NEXT:    [[C:%.*]] = fmul reassoc float [[X2:%.*]], 4.700000e+01
+; CHECK-NEXT:    [[D1:%.*]] = fsub reassoc float [[B]], [[C]]
+; CHECK-NEXT:    ret float [[D1]]
+;
+  %B = fmul reassoc float %X1, 47.   ; X1*47
+  %C = fmul reassoc float %X2, -47.  ; X2*-47
+  %D = fadd reassoc float %B, %C    ; X1*47 + X2*-47 -> 47*(X1-X2)
+  ret float %D
+}
+
+define float @test15(float %arg) {
+; CHECK-LABEL: @test15(
+; CHECK-NEXT:    [[T2:%.*]] = fmul fast float [[ARG:%.*]], 1.440000e+02
+; CHECK-NEXT:    ret float [[T2]]
+;
+  %t1 = fmul fast float 1.200000e+01, %arg ; 12*arg
+  %t2 = fmul fast float %t1, 1.200000e+01  ; (12*arg)*12 -> arg*144
+  ret float %t2
+}
+
+define float @test15_reassoc(float %arg) {
+; CHECK-LABEL: @test15_reassoc(
+; CHECK-NEXT:    [[T2:%.*]] = fmul reassoc float [[ARG:%.*]], 1.440000e+02
+; CHECK-NEXT:    ret float [[T2]]
+;
+  %t1 = fmul reassoc float 1.200000e+01, %arg
+  %t2 = fmul reassoc float %t1, 1.200000e+01 ; constant mul folding needs only 'reassoc'
+  ret float %t2
+}
+
+; (b+(a+1234))+-a -> b+1234
+define float @test16(float %b, float %a) {
+; CHECK-LABEL: @test16(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast float [[B:%.*]], 1.234000e+03
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %1 = fadd fast float %a, 1234.0
+  %2 = fadd fast float %b, %1
+  %3 = fsub fast float 0.0, %a ; -a
+  %4 = fadd fast float %2, %3  ; a and -a cancel -> b+1234
+  ret float %4
+}
+
+define float @test16_reassoc(float %b, float %a) {
+; CHECK-LABEL: @test16_reassoc(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd reassoc float [[A:%.*]], 1.234000e+03
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd reassoc float [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub reassoc float 0.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd reassoc float [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    ret float [[TMP4]]
+;
+  %1 = fadd reassoc float %a, 1234.0
+  %2 = fadd reassoc float %b, %1
+  %3 = fsub reassoc float 0.0, %a
+  %4 = fadd reassoc float %2, %3 ; no cancellation with 'reassoc' alone (per CHECKs above)
+  ret float %4
+}
+
+; Test that we can turn things like X*-(Y*Z) -> X*-1*Y*Z.
+
+define float @test17(float %a, float %b, float %z) {
+; CHECK-LABEL: @test17(
+; CHECK-NEXT:    [[E:%.*]] = fmul fast float [[A:%.*]], 1.234500e+04
+; CHECK-NEXT:    [[F:%.*]] = fmul fast float [[E]], [[B:%.*]]
+; CHECK-NEXT:    [[G:%.*]] = fmul fast float [[F]], [[Z:%.*]]
+; CHECK-NEXT:    ret float [[G]]
+;
+  %c = fsub fast float 0.000000e+00, %z ; -z
+  %d = fmul fast float %a, %b           ; a*b
+  %e = fmul fast float %c, %d           ; -z*a*b
+  %f = fmul fast float %e, 1.234500e+04
+  %g = fsub fast float 0.000000e+00, %f ; outer negation cancels the inner one
+  ret float %g
+}
+
+define float @test17_reassoc(float %a, float %b, float %z) {
+; CHECK-LABEL: @test17_reassoc(
+; CHECK-NEXT:    [[C:%.*]] = fsub reassoc float 0.000000e+00, [[Z:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fmul reassoc float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[E:%.*]] = fmul reassoc float [[D]], [[C]]
+; CHECK-NEXT:    [[F:%.*]] = fmul reassoc float [[E]], 1.234500e+04
+; CHECK-NEXT:    [[G:%.*]] = fsub reassoc float 0.000000e+00, [[F]]
+; CHECK-NEXT:    ret float [[G]]
+;
+  %c = fsub reassoc float 0.000000e+00, %z
+  %d = fmul reassoc float %a, %b
+  %e = fmul reassoc float %c, %d
+  %f = fmul reassoc float %e, 1.234500e+04
+  %g = fsub reassoc float 0.000000e+00, %f ; negations not cancelled with 'reassoc' alone
+  ret float %g
+}
+
+define float @test18(float %a, float %b, float %z) {
+; CHECK-LABEL: @test18(
+; CHECK-NEXT:    [[E:%.*]] = fmul fast float [[A:%.*]], 4.000000e+01
+; CHECK-NEXT:    [[F:%.*]] = fmul fast float [[E]], [[Z:%.*]]
+; CHECK-NEXT:    ret float [[F]]
+;
+  %d = fmul fast float %z, 4.000000e+01 ; z*40
+  %c = fsub fast float 0.000000e+00, %d ; -(z*40)
+  %e = fmul fast float %a, %c           ; a * -(z*40)
+  %f = fsub fast float 0.000000e+00, %e ; negate back -> a*z*40
+  ret float %f
+}
+
+define float @test18_reassoc(float %a, float %b, float %z) {
+; CHECK-LABEL: @test18_reassoc(
+; CHECK-NEXT:    [[D:%.*]] = fmul reassoc float [[Z:%.*]], 4.000000e+01
+; CHECK-NEXT:    [[C:%.*]] = fsub reassoc float 0.000000e+00, [[D]]
+; CHECK-NEXT:    [[E:%.*]] = fmul reassoc float [[C]], [[A:%.*]]
+; CHECK-NEXT:    [[F:%.*]] = fsub reassoc float 0.000000e+00, [[E]]
+; CHECK-NEXT:    ret float [[F]]
+;
+  %d = fmul reassoc float %z, 4.000000e+01
+  %c = fsub reassoc float 0.000000e+00, %d
+  %e = fmul reassoc float %a, %c
+  %f = fsub reassoc float 0.000000e+00, %e ; double negation kept with 'reassoc' alone
+  ret float %f
+}
+
+; With sub reassociation, constant folding can eliminate the 12 and -12 constants.
+define float @test19(float %A, float %B) {
+; CHECK-LABEL: @test19(
+; CHECK-NEXT:    [[Z:%.*]] = fsub fast float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %X = fadd fast float -1.200000e+01, %A ; a-12
+  %Y = fsub fast float %X, %B            ; (a-12)-b
+  %Z = fadd fast float %Y, 1.200000e+01  ; +12 cancels -12 -> a-b
+  ret float %Z
+}
+
+define float @test19_reassoc(float %A, float %B) {
+; CHECK-LABEL: @test19_reassoc(
+; CHECK-NEXT:    [[X:%.*]] = fadd reassoc float [[A:%.*]], -1.200000e+01
+; CHECK-NEXT:    [[Y:%.*]] = fsub reassoc float [[X]], [[B:%.*]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd reassoc float [[Y]], 1.200000e+01
+; CHECK-NEXT:    ret float [[Z]]
+;
+  %X = fadd reassoc float -1.200000e+01, %A
+  %Y = fsub reassoc float %X, %B
+  %Z = fadd reassoc float %Y, 1.200000e+01 ; constants not cancelled with 'reassoc' alone
+  ret float %Z
+}
+
+; With sub reassociation, constant folding can eliminate the uses of %a.
+define float @test20(float %a, float %b, float %c) nounwind  {
+; FIXME: Should be able to generate the below, which may expose more
+;        opportunities for FAdd reassociation.
+; %sum = fadd fast float %c, %b
+; %t7 = fsub fast float 0, %sum
+; CHECK-LABEL: @test20(
+; CHECK-NEXT:    [[B_NEG:%.*]] = fsub fast float -0.000000e+00, [[B:%.*]]
+; CHECK-NEXT:    [[T7:%.*]] = fsub fast float [[B_NEG]], [[C:%.*]]
+; CHECK-NEXT:    ret float [[T7]]
+;
+  %t3 = fsub fast float %a, %b  ; a-b
+  %t5 = fsub fast float %t3, %c ; a-b-c
+  %t7 = fsub fast float %t5, %a ; (a-b-c)-a -> -b-c; %a drops out
+  ret float %t7
+}
+
+define float @test20_reassoc(float %a, float %b, float %c) nounwind  {
+; CHECK-LABEL: @test20_reassoc(
+; CHECK-NEXT:    [[T3:%.*]] = fsub reassoc float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[T5:%.*]] = fsub reassoc float [[T3]], [[C:%.*]]
+; CHECK-NEXT:    [[T7:%.*]] = fsub reassoc float [[T5]], [[A]]
+; CHECK-NEXT:    ret float [[T7]]
+;
+  %t3 = fsub reassoc float %a, %b
+  %t5 = fsub reassoc float %t3, %c
+  %t7 = fsub reassoc float %t5, %a ; %a not eliminated with 'reassoc' alone
+  ret float %t7
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/fast-fp-commute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-fp-commute.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-fp-commute.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-fp-commute.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt -reassociate -S < %s | FileCheck %s
+
+declare void @use(float)
+
+define void @test1(float %x, float %y) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast float %y, %x
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float %y, %x
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    call void @use(float [[TMP1]])
+; CHECK-NEXT:    call void @use(float [[TMP3]])
+; CHECK-NEXT:    ret void
+;
+  %1 = fmul fast float %x, %y ; x*y
+  %2 = fmul fast float %y, %x ; y*x -- commuted operands, same product
+  %3 = fsub fast float %1, %2
+  call void @use(float %1)
+  call void @use(float %3)
+  ret void
+}
+
+define float @test2(float %x, float %y) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast float %y, %x
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float %y, %x
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret float [[TMP3]]
+;
+  %1 = fmul fast float %x, %y
+  %2 = fmul fast float %y, %x ; both multiplies canonicalized to y*x (per CHECKs)
+  %3 = fsub fast float %1, %2
+  ret float %3
+}
+
+define float @test3(float %x, float %y) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast float %y, %x
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast float [[FACTOR]], 2.000000e+00
+; CHECK-NEXT:    ret float [[REASS_MUL]]
+;
+  %1 = fmul fast float %x, %y
+  %2 = fmul fast float %y, %x
+  %3 = fadd fast float %1, %2 ; x*y + y*x -> 2*(x*y)
+  ret float %3
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/fast-mightymul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-mightymul.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-mightymul.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-mightymul.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt < %s -reassociate -disable-output
+; PR13021
+
+define float @test2(float %x) {
+  %t0 = fmul fast float %x, %x    ; x^2; each step squares the previous value
+  %t1 = fmul fast float %t0, %t0
+  %t2 = fmul fast float %t1, %t1
+  %t3 = fmul fast float %t2, %t2
+  %t4 = fmul fast float %t3, %t3
+  %t5 = fmul fast float %t4, %t4
+  %t6 = fmul fast float %t5, %t5
+  %t7 = fmul fast float %t6, %t6
+  %t8 = fmul fast float %t7, %t7
+  %t9 = fmul fast float %t8, %t8
+  %t10 = fmul fast float %t9, %t9
+  %t11 = fmul fast float %t10, %t10
+  %t12 = fmul fast float %t11, %t11
+  %t13 = fmul fast float %t12, %t12
+  %t14 = fmul fast float %t13, %t13
+  %t15 = fmul fast float %t14, %t14
+  %t16 = fmul fast float %t15, %t15
+  %t17 = fmul fast float %t16, %t16
+  %t18 = fmul fast float %t17, %t17
+  %t19 = fmul fast float %t18, %t18
+  %t20 = fmul fast float %t19, %t19
+  %t21 = fmul fast float %t20, %t20
+  %t22 = fmul fast float %t21, %t21
+  %t23 = fmul fast float %t22, %t22
+  %t24 = fmul fast float %t23, %t23
+  %t25 = fmul fast float %t24, %t24
+  %t26 = fmul fast float %t25, %t25
+  %t27 = fmul fast float %t26, %t26
+  %t28 = fmul fast float %t27, %t27 ; x^(2^29); stresses reassociate on a deep squaring chain (PR13021)
+  ret float %t28
+}

Added: llvm/trunk/test/Transforms/Reassociate/fast-multistep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fast-multistep.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fast-multistep.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fast-multistep.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; Check that a*a*b+a*a*c is turned into a*(a*(b+c)).
+
+define float @fmultistep1(float %a, float %b, float %c) {
+; CHECK-LABEL: @fmultistep1(
+; CHECK-NEXT:    [[REASS_ADD1:%.*]] = fadd fast float %c, %b
+; CHECK-NEXT:    [[REASS_MUL2:%.*]] = fmul fast float %a, %a
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast float [[REASS_MUL:%.*]]2, [[REASS_ADD1]]
+; CHECK-NEXT:    ret float [[REASS_MUL]]
+;
+  %t0 = fmul fast float %a, %b
+  %t1 = fmul fast float %a, %t0 ; a*(a*b)
+  %t2 = fmul fast float %a, %c
+  %t3 = fmul fast float %a, %t2 ; a*(a*c)
+  %t4 = fadd fast float %t1, %t3 ; a*a*b + a*a*c -> a*(a*(b+c))
+  ret float %t4
+}
+
+; Check that a*b+a*c+d is turned into a*(b+c)+d.
+
+define float @fmultistep2(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: @fmultistep2(
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = fadd fast float %c, %b
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast float [[REASS_ADD]], %a
+; CHECK-NEXT:    [[T3:%.*]] = fadd fast float [[REASS_MUL]], %d
+; CHECK-NEXT:    ret float [[T3]]
+;
+  %t0 = fmul fast float %a, %b  ; a*b
+  %t1 = fmul fast float %a, %c  ; a*c
+  %t2 = fadd fast float %t1, %d ; a*c+d
+  %t3 = fadd fast float %t0, %t2 ; a*b+(a*c+d)
+  ret float %t3
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/fp-commute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fp-commute.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fp-commute.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fp-commute.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt -reassociate -S < %s | FileCheck %s
+
+declare void @use(float)
+
+define void @test1(float %x, float %y) {
+; CHECK-LABEL: test1
+; CHECK: fmul float %x, %y
+; CHECK: fmul float %x, %y
+; CHECK: fsub float %1, %2
+; CHECK: call void @use(float %{{.*}})
+; CHECK: call void @use(float %{{.*}})
+
+  %1 = fmul float %x, %y ; no fast-math flags: strict FP
+  %2 = fmul float %y, %x
+  %3 = fsub float %1, %2 ; kept as a real subtract (per CHECKs above)
+  call void @use(float %1)
+  call void @use(float %3)
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/fp-expr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/fp-expr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/fp-expr.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/fp-expr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -reassociate < %s | FileCheck %s
+
+define void @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[T1:%.*]] = tail call <4 x float> @blam()
+; CHECK-NEXT:    [[T1_NEG:%.*]] = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[T1]]
+; CHECK-NEXT:    [[T24:%.*]] = fadd fast <4 x float> [[T1_NEG]], undef
+; CHECK-NEXT:    tail call void @wombat(<4 x float> [[T24]])
+; CHECK-NEXT:    ret void
+;
+  %t1 = tail call <4 x float> @blam()
+  %t23 = fsub fast <4 x float> undef, %t1 ; undef - t1; negated form per CHECKs
+  %t24 = fadd fast <4 x float> %t23, undef
+  tail call void @wombat(<4 x float> %t24)
+  ret void
+}
+
+define half @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[T15:%.*]] = fsub fast half undef, undef
+; CHECK-NEXT:    [[T15_NEG:%.*]] = fsub fast half 0xH8000, [[T15]]
+; CHECK-NEXT:    [[T18:%.*]] = fadd fast half [[T15_NEG]], undef
+; CHECK-NEXT:    ret half [[T18]]
+;
+  %t15 = fsub fast half undef, undef
+  %t17 = fsub fast half undef, %t15 ; rewritten as 0xH8000 (-0.0) minus t15 per CHECKs
+  %t18 = fadd fast half undef, %t17
+  ret half %t18
+}
+
+
+
+; Function Attrs: optsize
+declare <4 x float> @blam()
+
+; Function Attrs: optsize
+declare void @wombat(<4 x float>)
+

Added: llvm/trunk/test/Transforms/Reassociate/infloop-deadphi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/infloop-deadphi.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/infloop-deadphi.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/infloop-deadphi.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -reassociate %s -S | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f() {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[DONE:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i16 [[XOR1:%.*]], undef
+; CHECK-NEXT:    [[XOR1]] = xor i16 [[XOR0]], undef
+; CHECK-NEXT:    br i1 undef, label [[DEAD:%.*]], label [[DONE]]
+; CHECK:       done:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %done
+
+dead: ; only reachable from itself; %xor0/%xor1 form a cyclic def-use pair
+  %xor0 = xor i16 %xor1, undef
+  %xor1 = xor i16 %xor0, undef
+  br i1 undef, label %dead, label %done
+
+done:
+  %e = phi i16 [ %xor1, %dead ], [ 0, %entry ] ; dead phi; per test name, must not make reassociate loop forever
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/inverses.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/inverses.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/inverses.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/inverses.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -die -S | FileCheck %s
+
+; (A&B)&~A == 0
+define i32 @test1(i32 %a, i32 %b) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i32 0
+;
+  %t2 = and i32 %b, %a
+  %t4 = xor i32 %a, -1  ; ~a
+  %t5 = and i32 %t2, %t4 ; (b&a) & ~a -> 0
+  ret i32 %t5
+}
+
+define <2 x i32> @not_op_vec_undef(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @not_op_vec_undef(
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+  %t2 = and <2 x i32> %b, %a
+  %t4 = xor <2 x i32> %a, <i32 -1, i32 undef> ; 'not' with an undef lane still recognized
+  %t5 = and <2 x i32> %t2, %t4
+  ret <2 x i32> %t5
+}
+
+; A&~A == 0
+define i32 @test2(i32 %a, i32 %b) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    ret i32 0
+;
+  %t1 = and i32 %a, 1234
+  %t2 = and i32 %b, %t1
+  %t4 = xor i32 %a, -1  ; ~a
+  %t5 = and i32 %t2, %t4 ; contains both a and ~a -> 0
+  ret i32 %t5
+}
+
+; (b+(a+1234))+-a -> b+1234
+define i32 @test3(i32 %b, i32 %a) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[T5:%.*]] = add i32 [[B:%.*]], 1234
+; CHECK-NEXT:    ret i32 [[T5]]
+;
+  %t1 = add i32 %a, 1234
+  %t2 = add i32 %b, %t1
+  %t4 = sub i32 0, %a   ; -a
+  %t5 = add i32 %t2, %t4 ; a cancels -a
+  ret i32 %t5
+}
+
+; (b+(a+1234))+~a -> b+1233
+define i32 @test4(i32 %b, i32 %a) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[T5:%.*]] = add i32 [[B:%.*]], 1233
+; CHECK-NEXT:    ret i32 [[T5]]
+;
+  %t1 = add i32 %a, 1234
+  %t2 = add i32 %b, %t1
+  %t4 = xor i32 %a, -1  ; ~a == -a-1
+  %t5 = add i32 %t2, %t4 ; a + (-a-1) = -1, so 1234-1 = 1233
+  ret i32 %t5
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/keep-debug-loc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/keep-debug-loc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/keep-debug-loc.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/keep-debug-loc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,48 @@
+; RUN: opt -S -reassociate < %s | FileCheck %s
+
+; PR34231
+;
+; Verify that the original debug location is kept if the
+; replacement debug location is missing when
+; reassociating expressions.
+
+define i16 @fn1() !dbg !3 {
+  ret i16 undef ; stub callee; exists only to be called from @fn2
+}
+
+define void @fn2() !dbg !6 {
+; CHECK-LABEL: @fn2
+; CHECK: call i16 @fn1(), !dbg ![[LOC1:[0-9]+]]
+; CHECK-NOT: or i16
+  %inlinable_call = call i16 @fn1(), !dbg !7
+  %dbgless_instruction = or i16 %inlinable_call, 0 ; identity 'or'; carries no !dbg of its own
+  store i16 %dbgless_instruction, i16* undef, align 1
+  unreachable
+}
+
+define void @fn3() !dbg !8 {
+; CHECK-LABEL: @fn3
+; CHECK: load i16, i16* undef, !dbg ![[LOC2:[0-9]+]]
+; CHECK-NOT: or i16
+  %instruction = load i16, i16* undef, !dbg !9
+  %dbgless_instruction = or i16 %instruction, 0 ; identity 'or' without !dbg; replacement must keep !9
+  store i16 %dbgless_instruction, i16* undef, align 1
+  unreachable
+}
+
+; CHECK: ![[LOC1]] = !DILocation(line: 7
+; CHECK: ![[LOC2]] = !DILocation(line: 9
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "foo.c", directory: "/")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 2, type: !4, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: true, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 3, type: !4, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0)
+!7 = !DILocation(line: 7, column: 10, scope: !6)
+!8 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 8, type: !4, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0)
+!9 = !DILocation(line: 9, column: 10, scope: !8)

Added: llvm/trunk/test/Transforms/Reassociate/long-chains.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/long-chains.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/long-chains.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/long-chains.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; RUN: opt < %s -reassociate -stats -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+define i8 @longchain(i8 %in1, i8 %in2, i8 %in3, i8 %in4, i8 %in5, i8 %in6, i8 %in7, i8 %in8, i8 %in9, i8 %in10, i8 %in11, i8 %in12, i8 %in13, i8 %in14, i8 %in15, i8 %in16, i8 %in17, i8 %in18, i8 %in19, i8 %in20) {
+  %tmp1 = add i8 %in1, %in2   ; long linear add chain; see stats CHECKs below
+  %tmp2 = add i8 %tmp1, %in3
+  %tmp3 = add i8 %tmp2, %in4
+  %tmp4 = add i8 %tmp3, %in3  ; %in3/%in4 repeated -> factoring opportunity
+  %tmp5 = add i8 %tmp4, %in4
+  %tmp6 = add i8 %tmp5, %in5
+  %tmp7 = add i8 %tmp6, %in6
+  %tmp8 = add i8 %tmp7, %in7
+  %tmp9 = add i8 %tmp8, %in8
+  %tmp10 = add i8 %tmp9, %in9
+  %tmp11 = add i8 %tmp10, %in10
+  %tmp12 = add i8 %tmp11, %in11
+  %tmp13 = add i8 %tmp12, %in12
+  %tmp14 = add i8 %tmp13, %in13
+  %tmp15 = add i8 %tmp14, %in14
+  %tmp16 = add i8 %tmp15, %in15
+  %tmp17 = add i8 %tmp16, %in16
+  %tmp18 = add i8 %tmp17, %in17
+  %tmp19 = add i8 %tmp18, %in18
+  %tmp20 = add i8 %tmp19, %in19
+  %tmp21 = add i8 %tmp20, %in20
+  ret i8 %tmp20 ; NOTE(review): returns %tmp20, leaving %tmp21 dead — possibly meant %tmp21; verify against the stats thresholds below before changing
+}
+
+; Check the number of instructions reassociated is in the tens not the hundreds.
+; At the time of writing, the exact numbers were:
+; Bad order: 220 reassociate - Number of insts reassociated
+; Good order: 55 reassociate - Number of insts reassociated
+;
+; CHECK: {{^[1-9][0-9]}} reassociate - Number of insts reassociated
+
+; Additionally check that we made at least three changes.
+; CHECK:      {{^ *[3-9]}} reassociate - Number of multiplies factored

Added: llvm/trunk/test/Transforms/Reassociate/looptest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/looptest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/looptest.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/looptest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,52 @@
+; This testcase comes from this C fragment:
+;
+; void test(unsigned Num, int *Array) {
+;  unsigned i, j, k;
+;
+;  for (i = 0; i != Num; ++i)
+;    for (j = 0; j != Num; ++j)
+;      for (k = 0; k != Num; ++k)
+;        printf("%d\n", i+k+j);    /* Reassociate to (i+j)+k */
+;}
+;
+; In this case, we want to reassociate the specified expr so that i+j can be
+; hoisted out of the inner most loop.
+;
+; RUN: opt < %s -reassociate -S | FileCheck %s
+; END.
+ at .LC0 = internal global [4 x i8] c"%d\0A\00"		; <[4 x i8]*> [#uses=1]
+
+declare i32 @printf(i8*, ...)
+
+; Check that (i+j) has been reassociated (i=reg115, j=reg116)
+; CHECK: %reg113 = add i32 %reg116, %reg115
+define void @test(i32 %Num, i32* %Array) {
+bb0:
+	%cond221 = icmp eq i32 0, %Num		; <i1> [#uses=3]
+	br i1 %cond221, label %bb7, label %bb2
+bb2:		; preds = %bb6, %bb0  -- outer loop: i = %reg115
+	%reg115 = phi i32 [ %reg120, %bb6 ], [ 0, %bb0 ]		; <i32> [#uses=2]
+	br i1 %cond221, label %bb6, label %bb3
+bb3:		; preds = %bb5, %bb2  -- middle loop: j = %reg116
+	%reg116 = phi i32 [ %reg119, %bb5 ], [ 0, %bb2 ]		; <i32> [#uses=2]
+	br i1 %cond221, label %bb5, label %bb4
+bb4:		; preds = %bb4, %bb3  -- inner loop: k = %reg117
+	%reg117 = phi i32 [ %reg118, %bb4 ], [ 0, %bb3 ]		; <i32> [#uses=2]
+	%reg113 = add i32 %reg115, %reg117		; <i32> [#uses=1]  i+k; should be rewritten to i+j (loop-invariant here)
+	%reg114 = add i32 %reg113, %reg116		; <i32> [#uses=1]  (i+k)+j -> (i+j)+k per the CHECK above
+	%cast227 = getelementptr [4 x i8], [4 x i8]* @.LC0, i64 0, i64 0		; <i8*> [#uses=1]
+	call i32 (i8*, ...) @printf( i8* %cast227, i32 %reg114 )		; <i32>:0 [#uses=0]
+	%reg118 = add i32 %reg117, 1		; <i32> [#uses=2]
+	%cond224 = icmp ne i32 %reg118, %Num		; <i1> [#uses=1]
+	br i1 %cond224, label %bb4, label %bb5
+bb5:		; preds = %bb4, %bb3
+	%reg119 = add i32 %reg116, 1		; <i32> [#uses=2]
+	%cond225 = icmp ne i32 %reg119, %Num		; <i1> [#uses=1]
+	br i1 %cond225, label %bb3, label %bb6
+bb6:		; preds = %bb5, %bb2
+	%reg120 = add i32 %reg115, 1		; <i32> [#uses=2]
+	%cond226 = icmp ne i32 %reg120, %Num		; <i1> [#uses=1]
+	br i1 %cond226, label %bb2, label %bb7
+bb7:		; preds = %bb6, %bb0
+	ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/matching-binops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/matching-binops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/matching-binops.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/matching-binops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,359 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; PR37098 - https://bugs.llvm.org/show_bug.cgi?id=37098
+; In all positive tests, we should reassociate binops
+; to allow more factoring folds.
+
+; There are 5 associative integer binops *
+;           13 integer binops *
+;           4 operand commutes =
+;           260 potential variations of this fold
+; for integer binops. There are another 40 for FP.
+; Mix the commutation options to provide coverage using less tests.
+
+define i8 @and_shl(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Commute variant under test: %a = and(sx, z); %r = and(sy, a).
+; CHECK-LABEL: @and_shl(
+; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = shl i8 %x, %shamt
+  %sy = shl i8 %y, %shamt
+  %a = and i8 %sx, %z
+  %r = and i8 %sy, %a
+  ret i8 %r
+}
+
+define i8 @or_shl(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Commute variant under test: %a = or(sx, z); %r = or(a, sy).
+; CHECK-LABEL: @or_shl(
+; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = shl i8 %x, %shamt
+  %sy = shl i8 %y, %shamt
+  %a = or i8 %sx, %z
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @xor_shl(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Commute variant under test: %a = xor(z, sx); %r = xor(a, sy).
+; CHECK-LABEL: @xor_shl(
+; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = xor i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = shl i8 %x, %shamt
+  %sy = shl i8 %y, %shamt
+  %a = xor i8 %z, %sx
+  %r = xor i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @and_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Commute variant under test: %a = and(z, sx); %r = and(sy, a).
+; CHECK-LABEL: @and_lshr(
+; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = and i8 %z, %sx
+  %r = and i8 %sy, %a
+  ret i8 %r
+}
+
+define i8 @or_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Commute variant under test: %a = or(sx, z); %r = or(sy, a).
+; CHECK-LABEL: @or_lshr(
+; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = or i8 %sx, %z
+  %r = or i8 %sy, %a
+  ret i8 %r
+}
+
+define i8 @xor_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Commute variant under test: %a = xor(sx, z); %r = xor(a, sy).
+; CHECK-LABEL: @xor_lshr(
+; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = xor i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = xor i8 %sx, %z
+  %r = xor i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @and_ashr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Commute variant under test: %a = and(z, sx); %r = and(a, sy).
+; CHECK-LABEL: @and_ashr(
+; CHECK-NEXT:    [[SX:%.*]] = ashr i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = ashr i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = ashr i8 %x, %shamt
+  %sy = ashr i8 %y, %shamt
+  %a = and i8 %z, %sx
+  %r = and i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @or_ashr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Commute variant under test: %a = or(z, sx); %r = or(sy, a).
+; CHECK-LABEL: @or_ashr(
+; CHECK-NEXT:    [[SX:%.*]] = ashr i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = ashr i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = ashr i8 %x, %shamt
+  %sy = ashr i8 %y, %shamt
+  %a = or i8 %z, %sx
+  %r = or i8 %sy, %a
+  ret i8 %r
+}
+
+; Vectors work too.
+
+define <2 x i8> @xor_ashr(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z, <2 x i8> %shamt) {
+; Same fold as the scalar xor/ashr case, on <2 x i8> operands.
+; CHECK-LABEL: @xor_ashr(
+; CHECK-NEXT:    [[SX:%.*]] = ashr <2 x i8> [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = ashr <2 x i8> [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = xor <2 x i8> [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = xor <2 x i8> [[A]], [[SY]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %sx = ashr <2 x i8> %x, %shamt
+  %sy = ashr <2 x i8> %y, %shamt
+  %a = xor <2 x i8> %sx, %z
+  %r = xor <2 x i8> %a, %sy
+  ret <2 x i8> %r
+}
+
+; Negative test - different logic ops
+
+define i8 @or_and_shl(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Negative: outer ops differ (or vs. and), so the IR must be left as-is
+; apart from canonical operand order (CHECK lines match the input shape).
+; CHECK-LABEL: @or_and_shl(
+; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = shl i8 %x, %shamt
+  %sy = shl i8 %y, %shamt
+  %a = or i8 %sx, %z
+  %r = and i8 %sy, %a
+  ret i8 %r
+}
+
+; Negative test - different shift ops
+
+define i8 @or_lshr_shl(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Negative: inner shift opcodes differ (lshr vs. shl), blocking the fold.
+; CHECK-LABEL: @or_lshr_shl(
+; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = shl i8 %y, %shamt
+  %a = or i8 %sx, %z
+  %r = or i8 %a, %sy
+  ret i8 %r
+}
+
+; Negative test - multi-use
+
+define i8 @xor_lshr_multiuse(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Negative: %a has a second use (%r2), so it must not be reassociated away.
+; CHECK-LABEL: @xor_lshr_multiuse(
+; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[A:%.*]] = xor i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[R2:%.*]] = sdiv i8 [[A]], [[R]]
+; CHECK-NEXT:    ret i8 [[R2]]
+;
+  %sx = lshr i8 %x, %shamt
+  %sy = lshr i8 %y, %shamt
+  %a = xor i8 %sx, %z
+  %r = xor i8 %a, %sy
+  %r2 = sdiv i8 %a, %r
+  ret i8 %r2
+}
+
+; Math ops work too. Change instruction positions too to verify placement.
+
+define i8 @add_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
+; Instructions are interleaved (sx, a, sy, r) to verify the pass keeps
+; valid placement; CHECK-NEXT lines pin the exact output order.
+; CHECK-LABEL: @add_lshr(
+; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = add i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
+; CHECK-NEXT:    [[R:%.*]] = add i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = lshr i8 %x, %shamt
+  %a = add i8 %sx, %z
+  %sy = lshr i8 %y, %shamt
+  %r = add i8 %a, %sy
+  ret i8 %r
+}
+
+; Make sure wrapping flags are cleared.
+
+define i8 @mul_sub(i8 %x, i8 %y, i8 %z, i8 %m) {
+; Input carries nsw/nuw on the muls; CHECK lines keep the same shape here.
+; CHECK-LABEL: @mul_sub(
+; CHECK-NEXT:    [[SX:%.*]] = sub i8 [[X:%.*]], [[M:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = sub i8 [[Y:%.*]], [[M]]
+; CHECK-NEXT:    [[A:%.*]] = mul nsw i8 [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = mul nuw i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = sub i8 %x, %m
+  %sy = sub i8 %y, %m
+  %a = mul nsw i8 %sx, %z
+  %r = mul nuw i8 %a, %sy
+  ret i8 %r
+}
+
+define i8 @add_mul(i8 %x, i8 %y, i8 %z, i8 %m) {
+; Mixed add/mul chain with a constant multiplier (42) and nuw/nsw flags.
+; CHECK-LABEL: @add_mul(
+; CHECK-NEXT:    [[SX:%.*]] = mul nuw i8 [[X:%.*]], 42
+; CHECK-NEXT:    [[A:%.*]] = add nuw i8 [[Z:%.*]], [[SX]]
+; CHECK-NEXT:    [[SY:%.*]] = mul nsw i8 [[M:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = add nsw i8 [[A]], [[SY]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %sx = mul nuw i8 %x, 42
+  %a = add nuw i8 %sx, %z
+  %sy = mul nsw i8 %y, %m
+  %r = add nsw i8 %sy, %a
+  ret i8 %r
+}
+
+; Floating-point works too if it's not strict.
+; TODO: These should not require the full 'fast' FMF.
+
+define float @fadd_fmul(float %x, float %y, float %z, float %m) {
+; The fadds carry 'fast'; the fmuls do not, so they must stay untouched.
+; CHECK-LABEL: @fadd_fmul(
+; CHECK-NEXT:    [[SX:%.*]] = fmul float [[X:%.*]], [[M:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = fadd fast float [[SX]], [[Z:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = fmul float [[Y:%.*]], [[M]]
+; CHECK-NEXT:    [[R:%.*]] = fadd fast float [[A]], [[SY]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %sx = fmul float %x, %m
+  %a = fadd fast float %sx, %z
+  %sy = fmul float %y, %m
+  %r = fadd fast float %sy, %a
+  ret float %r
+}
+
+define float @fmul_fdiv(float %x, float %y, float %z, float %m) {
+; fmul-of-fdiv variant; one divisor is the constant 42.0.
+; CHECK-LABEL: @fmul_fdiv(
+; CHECK-NEXT:    [[SX:%.*]] = fdiv float [[X:%.*]], [[M:%.*]]
+; CHECK-NEXT:    [[SY:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01
+; CHECK-NEXT:    [[A:%.*]] = fmul fast float [[SY]], [[Z:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = fmul fast float [[A]], [[SX]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %sx = fdiv float %x, %m
+  %sy = fdiv float %y, 42.0
+  %a = fmul fast float %z, %sx
+  %r = fmul fast float %sy, %a
+  ret float %r
+}
+
+; Verify that debug info for modified instructions gets discarded (references become undef).
+
+define i32 @and_shl_dbg(i32 %x, i32 %y, i32 %z, i32 %shamt) {
+; NOTE: the input's dbg metadata ids (!13..!20, !dbg !21..!33) differ from
+; the ids the CHECK lines expect (!7, !13..!19, !dbg !20..!32) — the CHECKs
+; describe the renumbered output after the pass runs.
+; CHECK-LABEL: @and_shl_dbg(
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[X:%.*]], metadata !7, metadata !DIExpression()), !dbg !20
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[Y:%.*]], metadata !13, metadata !DIExpression()), !dbg !21
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[Z:%.*]], metadata !14, metadata !DIExpression()), !dbg !22
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[SHAMT:%.*]], metadata !15, metadata !DIExpression()), !dbg !23
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[X]], [[SHAMT]], !dbg !24
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[SHL]], metadata !16, metadata !DIExpression()), !dbg !25
+; CHECK-NEXT:    [[SHL1:%.*]] = shl i32 [[Y]], [[SHAMT]], !dbg !26
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[SHL1]], metadata !17, metadata !DIExpression()), !dbg !27
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SHL]], [[Z]], !dbg !28
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[AND]], metadata !18, metadata !DIExpression()), !dbg !29
+; CHECK-NEXT:    [[AND2:%.*]] = and i32 [[AND]], [[SHL1]], !dbg !30
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[AND2]], metadata !19, metadata !DIExpression()), !dbg !31
+; CHECK-NEXT:    ret i32 [[AND2]], !dbg !32
+;
+  call void @llvm.dbg.value(metadata i32 %x, metadata !13, metadata !DIExpression()), !dbg !21
+  call void @llvm.dbg.value(metadata i32 %y, metadata !14, metadata !DIExpression()), !dbg !22
+  call void @llvm.dbg.value(metadata i32 %z, metadata !15, metadata !DIExpression()), !dbg !23
+  call void @llvm.dbg.value(metadata i32 %shamt, metadata !16, metadata !DIExpression()), !dbg !24
+  %shl = shl i32 %x, %shamt, !dbg !25
+  call void @llvm.dbg.value(metadata i32 %shl, metadata !17, metadata !DIExpression()), !dbg !26
+  %shl1 = shl i32 %y, %shamt, !dbg !27
+  call void @llvm.dbg.value(metadata i32 %shl1, metadata !18, metadata !DIExpression()), !dbg !28
+  %and = and i32 %shl, %z, !dbg !29
+  call void @llvm.dbg.value(metadata i32 %and, metadata !19, metadata !DIExpression()), !dbg !30
+  %and2 = and i32 %and, %shl1, !dbg !31
+  call void @llvm.dbg.value(metadata i32 %and2, metadata !20, metadata !DIExpression()), !dbg !32
+  ret i32 %and2, !dbg !33
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 7.0.0 (trunk 331069)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "ass.c", directory: "/Users/spatel/myllvm/release/bin")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 7.0.0 (trunk 331069)"}
+!8 = distinct !DISubprogram(name: "and_shl_dbg", scope: !1, file: !1, line: 1, type: !9, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!9 = !DISubroutineType(types: !10)
+!10 = !{!11, !11, !11, !11, !11}
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !20}
+!13 = !DILocalVariable(name: "x", arg: 1, scope: !8, file: !1, line: 1, type: !11)
+!14 = !DILocalVariable(name: "y", arg: 2, scope: !8, file: !1, line: 1, type: !11)
+!15 = !DILocalVariable(name: "z", arg: 3, scope: !8, file: !1, line: 1, type: !11)
+!16 = !DILocalVariable(name: "shamt", arg: 4, scope: !8, file: !1, line: 1, type: !11)
+!17 = !DILocalVariable(name: "sx", scope: !8, file: !1, line: 2, type: !11)
+!18 = !DILocalVariable(name: "sy", scope: !8, file: !1, line: 3, type: !11)
+!19 = !DILocalVariable(name: "a", scope: !8, file: !1, line: 4, type: !11)
+!20 = !DILocalVariable(name: "r", scope: !8, file: !1, line: 5, type: !11)
+!21 = !DILocation(line: 1, column: 21, scope: !8)
+!22 = !DILocation(line: 1, column: 28, scope: !8)
+!23 = !DILocation(line: 1, column: 35, scope: !8)
+!24 = !DILocation(line: 1, column: 42, scope: !8)
+!25 = !DILocation(line: 2, column: 14, scope: !8)
+!26 = !DILocation(line: 2, column: 7, scope: !8)
+!27 = !DILocation(line: 3, column: 14, scope: !8)
+!28 = !DILocation(line: 3, column: 7, scope: !8)
+!29 = !DILocation(line: 4, column: 14, scope: !8)
+!30 = !DILocation(line: 4, column: 7, scope: !8)
+!31 = !DILocation(line: 5, column: 14, scope: !8)
+!32 = !DILocation(line: 5, column: 7, scope: !8)
+!33 = !DILocation(line: 6, column: 3, scope: !8)
+

Added: llvm/trunk/test/Transforms/Reassociate/mightymul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/mightymul.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/mightymul.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/mightymul.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt < %s -reassociate -disable-output
+; PR13021
+
+define i32 @test1(i32 %x) {
+  ; 28 chained squarings (x^(2^28)). This is a compile-time stress test
+  ; (PR13021): the RUN line only requires -reassociate to terminate without
+  ; blowup, not any particular output (-disable-output, no FileCheck).
+  %t0 = mul i32 %x, %x
+  %t1 = mul i32 %t0, %t0
+  %t2 = mul i32 %t1, %t1
+  %t3 = mul i32 %t2, %t2
+  %t4 = mul i32 %t3, %t3
+  %t5 = mul i32 %t4, %t4
+  %t6 = mul i32 %t5, %t5
+  %t7 = mul i32 %t6, %t6
+  %t8 = mul i32 %t7, %t7
+  %t9 = mul i32 %t8, %t8
+  %t10 = mul i32 %t9, %t9
+  %t11 = mul i32 %t10, %t10
+  %t12 = mul i32 %t11, %t11
+  %t13 = mul i32 %t12, %t12
+  %t14 = mul i32 %t13, %t13
+  %t15 = mul i32 %t14, %t14
+  %t16 = mul i32 %t15, %t15
+  %t17 = mul i32 %t16, %t16
+  %t18 = mul i32 %t17, %t17
+  %t19 = mul i32 %t18, %t18
+  %t20 = mul i32 %t19, %t19
+  %t21 = mul i32 %t20, %t20
+  %t22 = mul i32 %t21, %t21
+  %t23 = mul i32 %t22, %t22
+  %t24 = mul i32 %t23, %t23
+  %t25 = mul i32 %t24, %t24
+  %t26 = mul i32 %t25, %t25
+  %t27 = mul i32 %t26, %t26
+  %t28 = mul i32 %t27, %t27
+  ret i32 %t28
+}

Added: llvm/trunk/test/Transforms/Reassociate/min_int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/min_int.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/min_int.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/min_int.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt < %s -reassociate -dce -S | FileCheck %s
+
+; MIN_INT cannot be negated during reassociation
+
+define i32 @minint(i32 %i) {
+; The multiplier is INT32_MIN (-2147483648), whose negation overflows, so
+; the pass must leave both instructions exactly as written.
+; CHECK:  %mul = mul i32 %i, -2147483648
+; CHECK-NEXT:  %add = add i32 %mul, 1
+; CHECK-NEXT:  ret i32 %add
+  %mul = mul i32 %i, -2147483648
+  %add = add i32 %mul, 1
+  ret i32 %add
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/mixed-fast-nonfast-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/mixed-fast-nonfast-fp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/mixed-fast-nonfast-fp.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/mixed-fast-nonfast-fp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,42 @@
+; RUN: opt -reassociate %s -S | FileCheck %s
+
+define float @foo(float %a,float %b, float %c) {
+; The strict (flag-free) %mul3 must survive untouched while the fast ops
+; around it are reassociated and factored (a*c + a*c -> 2*c, then *a).
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul float %a, %b
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast float %c, 2.000000e+00
+; CHECK-NEXT:    [[REASS_ADD1:%.*]] = fadd fast float [[FACTOR]], %b
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast float [[REASS_ADD1]], %a
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[REASS_MUL]], [[MUL3]]
+; CHECK-NEXT:    ret float [[ADD3]]
+;
+  %mul1 = fmul fast float %a, %c
+  %mul2 = fmul fast float %a, %b
+  %mul3 = fmul float %a, %b   ; STRICT
+  %mul4 = fmul fast float %a, %c
+  %add1 = fadd fast  float %mul1, %mul3
+  %add2 = fadd fast float %mul4, %mul2
+  %add3 = fadd fast float %add1, %add2
+  ret float %add3
+}
+
+define float @foo_reassoc(float %a,float %b, float %c) {
+; Same dataflow as @foo but with only 'reassoc' (not full 'fast') on some
+; ops; the CHECK lines show no factoring happens in that case.
+; CHECK-LABEL: @foo_reassoc(
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul reassoc float %a, %c
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast float %b, %a
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul float %a, %b
+; CHECK-NEXT:    [[MUL4:%.*]] = fmul reassoc float %a, %c
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[MUL1]], [[MUL3]]
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd reassoc float [[MUL2]], [[MUL4]]
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[ADD1]], [[ADD2]]
+; CHECK-NEXT:    ret float [[ADD3]]
+;
+  %mul1 = fmul reassoc float %a, %c
+  %mul2 = fmul fast float %a, %b
+  %mul3 = fmul float %a, %b   ; STRICT
+  %mul4 = fmul reassoc float %a, %c
+  %add1 = fadd fast  float %mul1, %mul3
+  %add2 = fadd reassoc float %mul4, %mul2
+  %add3 = fadd fast float %add1, %add2
+  ret float %add3
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/mulfactor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/mulfactor.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/mulfactor.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/mulfactor.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,128 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+define i32 @test1(i32 %a, i32 %b) {
+; a*a + (2a)*b + b*b: the CHECKs expect the middle terms factored through b,
+; i.e. (2a + b)*b + a*a.
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[T2:%.*]] = mul i32 %a, %a
+; CHECK-NEXT:    [[T6:%.*]] = mul i32 %a, 2
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i32 [[T6]], %b
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul i32 [[REASS_ADD]], %b
+; CHECK-NEXT:    [[T11:%.*]] = add i32 [[REASS_MUL]], [[T2]]
+; CHECK-NEXT:    ret i32 [[T11]]
+;
+  %t2 = mul i32 %a, %a
+  %t5 = shl i32 %a, 1
+  %t6 = mul i32 %t5, %b
+  %t8 = mul i32 %b, %b
+  %t7 = add i32 %t6, %t2
+  %t11 = add i32 %t7, %t8
+  ret i32 %t11
+}
+
+define i32 @test2(i32 %t) {
+; 6t + 36t + 15 folds to 42t + 15.
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul i32 %t, 42
+; CHECK-NEXT:    [[D:%.*]] = add i32 [[REASS_MUL]], 15
+; CHECK-NEXT:    ret i32 [[D]]
+;
+  %a = mul i32 %t, 6
+  %b = mul i32 %t, 36
+  %c = add i32 %b, 15
+  %d = add i32 %c, %a
+  ret i32 %d
+}
+
+; (x^8)
+define i32 @test3(i32 %x) {
+; Power expressed as a linear mul chain; expected to become 3 squarings.
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 %x, %x
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %a = mul i32 %x, %x
+  %b = mul i32 %a, %x
+  %c = mul i32 %b, %x
+  %d = mul i32 %c, %x
+  %e = mul i32 %d, %x
+  %f = mul i32 %e, %x
+  %g = mul i32 %f, %x
+  ret i32 %g
+}
+
+; (x^7)
+define i32 @test4(i32 %x) {
+; Odd power: x^7 = (x^4)*(x^3), built from shared partial products.
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 %x, %x
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], %x
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], %x
+; CHECK-NEXT:    [[F:%.*]] = mul i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[F]]
+;
+  %a = mul i32 %x, %x
+  %b = mul i32 %a, %x
+  %c = mul i32 %b, %x
+  %d = mul i32 %c, %x
+  %e = mul i32 %d, %x
+  %f = mul i32 %e, %x
+  ret i32 %f
+}
+
+; (x^4) * (y^2)
+define i32 @test5(i32 %x, i32 %y) {
+; Mixed-variable power: expected (x*x*y)^2.
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 %x, %x
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], %y
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %a = mul i32 %x, %y
+  %b = mul i32 %a, %y
+  %c = mul i32 %b, %x
+  %d = mul i32 %c, %x
+  %e = mul i32 %d, %x
+  ret i32 %e
+}
+
+; (x^5) * (y^3) * z
+define i32 @test6(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 %x, %x
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], %y
+; CHECK-NEXT:    [[F:%.*]] = mul i32 %y, %x
+; CHECK-NEXT:    [[G:%.*]] = mul i32 [[F]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[G]], [[TMP2]]
+; CHECK-NEXT:    [[H:%.*]] = mul i32 [[TMP3]], %z
+; CHECK-NEXT:    ret i32 [[H]]
+;
+  %a = mul i32 %x, %y
+  %b = mul i32 %a, %x
+  %c = mul i32 %b, %y
+  %d = mul i32 %c, %x
+  %e = mul i32 %d, %y
+  %f = mul i32 %e, %x
+  %g = mul i32 %f, %z
+  %h = mul i32 %g, %x
+  ret i32 %h
+}
+
+; (x^4) * (y^3) * (z^2)
+define i32 @test7(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 %x, %x
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], %y
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], %z
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[TMP3]], %y
+; CHECK-NEXT:    [[H:%.*]] = mul i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    ret i32 [[H]]
+;
+  %a = mul i32 %y, %x
+  %b = mul i32 %a, %z
+  %c = mul i32 %b, %z
+  %d = mul i32 %c, %x
+  %e = mul i32 %d, %y
+  %f = mul i32 %e, %y
+  %g = mul i32 %f, %x
+  %h = mul i32 %g, %x
+  ret i32 %h
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/multistep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/multistep.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/multistep.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/multistep.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; Check that a*a*b+a*a*c is turned into a*(a*(b+c)).
+
+define i64 @multistep1(i64 %a, i64 %b, i64 %c) {
+; a*a*b + a*a*c -> (a*a)*(b+c); the quirky [[REASS_MUL:%.*]]2 pattern below
+; is how update_test_checks.py captured the numbered temporary.
+; CHECK-LABEL: @multistep1(
+; CHECK-NEXT:    [[REASS_ADD1:%.*]] = add i64 %c, %b
+; CHECK-NEXT:    [[REASS_MUL2:%.*]] = mul i64 %a, %a
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul i64 [[REASS_MUL:%.*]]2, [[REASS_ADD1]]
+; CHECK-NEXT:    ret i64 [[REASS_MUL]]
+;
+  %t0 = mul i64 %a, %b
+  %t1 = mul i64 %a, %t0 ; a*(a*b)
+  %t2 = mul i64 %a, %c
+  %t3 = mul i64 %a, %t2 ; a*(a*c)
+  %t4 = add i64 %t1, %t3
+  ret i64 %t4
+}
+
+; Check that a*b+a*c+d is turned into a*(b+c)+d.
+
+define i64 @multistep2(i64 %a, i64 %b, i64 %c, i64 %d) {
+; CHECK-LABEL: @multistep2(
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i64 %c, %b
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul i64 [[REASS_ADD]], %a
+; CHECK-NEXT:    [[T3:%.*]] = add i64 [[REASS_MUL]], %d
+; CHECK-NEXT:    ret i64 [[T3]]
+;
+  %t0 = mul i64 %a, %b
+  %t1 = mul i64 %a, %c
+  %t2 = add i64 %t1, %d ; a*c+d
+  %t3 = add i64 %t0, %t2 ; a*b+(a*c+d)
+  ret i64 %t3
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/negation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/negation.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/negation.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/negation.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+; Test that we can turn things like X*-(Y*Z) -> X*-1*Y*Z.
+
+define i32 @test1(i32 %a, i32 %b, i32 %z) {
+; Two negations (of %z and of the whole product) cancel: the CHECKs expect
+; a plain a*12345*b*z chain with no sub.
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], 12345
+; CHECK-NEXT:    [[F:%.*]] = mul i32 [[E]], [[B:%.*]]
+; CHECK-NEXT:    [[G:%.*]] = mul i32 [[F]], [[Z:%.*]]
+; CHECK-NEXT:    ret i32 [[G]]
+;
+  %c = sub i32 0, %z
+  %d = mul i32 %a, %b
+  %e = mul i32 %c, %d
+  %f = mul i32 %e, 12345
+  %g = sub i32 0, %f
+  ret i32 %g
+}
+
+define i32 @test2(i32 %a, i32 %b, i32 %z) {
+; Same cancellation with a constant factor: -(a * -(z*40)) == a*40*z.
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], 40
+; CHECK-NEXT:    [[F:%.*]] = mul i32 [[E]], [[Z:%.*]]
+; CHECK-NEXT:    ret i32 [[F]]
+;
+  %d = mul i32 %z, 40
+  %c = sub i32 0, %d
+  %e = mul i32 %a, %c
+  %f = sub i32 0, %e
+  ret i32 %f
+}
+
+define <2 x i32> @negate_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) {
+; Vector version with a partially-undef zero vector in the subs.
+; CHECK-LABEL: @negate_vec_undefs(
+; CHECK-NEXT:    [[E:%.*]] = mul <2 x i32> [[A:%.*]], <i32 40, i32 40>
+; CHECK-NEXT:    [[F:%.*]] = mul <2 x i32> [[E]], [[Z:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[F]]
+;
+  %d = mul <2 x i32> %z, <i32 40, i32 40>
+  %c = sub <2 x i32> <i32 0, i32 undef>, %d
+  %e = mul <2 x i32> %a, %c
+  %f = sub <2 x i32> <i32 0, i32 undef>, %e
+  ret <2 x i32> %f
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/negation1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/negation1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/negation1.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/negation1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+; Test that we can turn things like A*B + X - A*B -> X.
+
+define i32 @test1(i32 %a, i32 %b, i32 %x) {
+; (a*b + x) - a*b must simplify to just x.
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i32 [[X:%.*]]
+;
+  %c = mul i32 %a, %b
+  %d = add i32 %c, %x
+  %c1 = mul i32 %a, %b
+  %f = sub i32 %d, %c1
+  ret i32 %f
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/no-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/no-op.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/no-op.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/no-op.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; When there is nothing to do, or not much to do, check that reassociate leaves
+; things alone.
+
+declare void @use(i32)
+
+define void @test1(i32 %a, i32 %b) {
+; Shouldn't change or move any of the add instructions.  Should commute but
+; otherwise not change or move any of the mul instructions.
+; Note the adds keep their nsw flags since nothing is reassociated.
+; CHECK-LABEL: @test1(
+  %a0 = add nsw i32 %a, 1
+; CHECK-NEXT: %a0 = add nsw i32 %a, 1
+  %m0 = mul nsw i32 3, %a
+; CHECK-NEXT: %m0 = mul nsw i32 %a, 3
+  %a1 = add nsw i32 %a0, %b
+; CHECK-NEXT: %a1 = add nsw i32 %a0, %b
+  %m1 = mul nsw i32 %b, %m0
+; CHECK-NEXT: %m1 = mul nsw i32 %m0, %b
+  call void @use(i32 %a1)
+; CHECK-NEXT: call void @use
+  call void @use(i32 %m1)
+  ret void
+}
+
+define void @test2(i32 %a, i32 %b, i32 %c, i32 %d) {
+; The initial add doesn't change so should not lose the nsw flag.
+; Per the CHECK lines, the later adds DO change (%c/%d swap) and drop nsw.
+; CHECK-LABEL: @test2(
+  %a0 = add nsw i32 %b, %a
+; CHECK-NEXT: %a0 = add nsw i32 %b, %a
+  %a1 = add nsw i32 %a0, %d
+; CHECK-NEXT: %a1 = add i32 %a0, %c
+  %a2 = add nsw i32 %a1, %c
+; CHECK-NEXT: %a2 = add i32 %a1, %d
+  call void @use(i32 %a2)
+; CHECK-NEXT: call void @use
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/optional-flags.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/optional-flags.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/optional-flags.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/optional-flags.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -S -reassociate < %s | FileCheck %s
+; rdar://8944681
+
+; Reassociate should clear optional flags like nsw when reassociating.
+
+; CHECK-LABEL: @test0(
+; CHECK: %y = add i64 %b, %a
+; CHECK: %z = add i64 %y, %c
+define i64 @test0(i64 %a, i64 %b, i64 %c) {
+  ; nsw on the input add; operands are reordered, so nsw must be dropped.
+  %y = add nsw i64 %c, %b
+  %z = add i64 %y, %a
+  ret i64 %z
+}
+
+; CHECK-LABEL: @test1(
+; CHECK: %y = add i64 %b, %a
+; CHECK: %z = add i64 %y, %c
+define i64 @test1(i64 %a, i64 %b, i64 %c) {
+  ; nsw on the second add this time; same expectation as @test0.
+  %y = add i64 %c, %b
+  %z = add nsw i64 %y, %a
+  ret i64 %z
+}
+
+; PR9215
+; CHECK: %s = add nsw i32 %y, %x
+define i32 @test2(i32 %x, i32 %y) {
+  ; Only a commute happens here, so nsw is expected to survive.
+  %s = add nsw i32 %x, %y
+  ret i32 %s
+}

Added: llvm/trunk/test/Transforms/Reassociate/otherops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/otherops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/otherops.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/otherops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,42 @@
+; Reassociation should apply to Add, Mul, And, Or, & Xor
+;
+; RUN: opt < %s -reassociate -constprop -instcombine -die -S | FileCheck %s
+
+define i32 @test_mul(i32 %arg) {
+; 12 * arg * 12 folds to arg * 144.
+; CHECK-LABEL: test_mul
+; CHECK-NEXT: %tmp2 = mul i32 %arg, 144
+; CHECK-NEXT: ret i32 %tmp2
+
+  %tmp1 = mul i32 12, %arg
+  %tmp2 = mul i32 %tmp1, 12
+  ret i32 %tmp2
+}
+
+define i32 @test_and(i32 %arg) {
+; and is idempotent: (14 & arg) & 14 == arg & 14.
+; CHECK-LABEL: test_and
+; CHECK-NEXT: %tmp2 = and i32 %arg, 14
+; CHECK-NEXT: ret i32 %tmp2
+
+  %tmp1 = and i32 14, %arg
+  %tmp2 = and i32 %tmp1, 14
+  ret i32 %tmp2
+}
+
+define i32 @test_or(i32 %arg) {
+; or is idempotent: (14 | arg) | 14 == arg | 14.
+; CHECK-LABEL: test_or
+; CHECK-NEXT: %tmp2 = or i32 %arg, 14
+; CHECK-NEXT: ret i32 %tmp2
+
+  %tmp1 = or i32 14, %arg
+  %tmp2 = or i32 %tmp1, 14
+  ret i32 %tmp2
+}
+
+define i32 @test_xor(i32 %arg) {
+; xor self-cancels: (12 ^ arg) ^ 12 == arg.
+; CHECK-LABEL: test_xor
+; CHECK-NEXT: ret i32 %arg
+
+  %tmp1 = xor i32 12, %arg
+  %tmp2 = xor i32 %tmp1, 12
+  ret i32 %tmp2
+}

Added: llvm/trunk/test/Transforms/Reassociate/pointer-collision-non-determinism.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/pointer-collision-non-determinism.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/pointer-collision-non-determinism.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/pointer-collision-non-determinism.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -run-twice -reassociate %s -S -o - | FileCheck %s
+; RUN: opt -run-twice -reassociate %s -S -o - | FileCheck %s
+
+; The PairMap[NumBinaryOps] used by the Reassociate pass used to have Value
+; *pointers as keys and no handling for values being removed. In some cases (in
+; practice very rarely, but in this particular test - well over 50% of the time)
+; a newly created Value would happen to get allocated at the same memory
+; address, effectively "replacing" the key in the map.
+;
+; Test that that doesn't happen anymore and the pass is deterministic.
+;
+; The failure rate of this test (at least, on my 8 core iMac), when run in the
+; context of other unit tests executed — specifically, I was trying
+;
+;   ./bin/llvm-lit -v ../test/Transforms/Reassociate
+;
+; is as follows:
+;
+; # of RUN lines repeated | just -run-twice | -run-twice and CHECK lines
+; ------------------------+-----------------+---------------------------
+;  1                      |             30% |          <didn't measure>
+;  2                      |             55% |          95%
+;  3                      |             55% |          <didn't measure>
+;
+; hence the specific shape of this test. The IR itself comes from a real-world
+; code, successfully bugpointed.
+
+define float @test(float %arg) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = fmul fast float [[ARG:%.*]], 0x3FE99999A0000000
+; CHECK-NEXT:    [[TMP110:%.*]] = fsub fast float 1.000000e+00, [[TMP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast float [[ARG]], 0x3FE99999A0000000
+; CHECK-NEXT:    [[TMP311:%.*]] = fsub fast float 1.000000e+00, [[TMP2]]
+; CHECK-NEXT:    [[REASS_MUL160:%.*]] = fmul fast float [[TMP110]], [[ARG]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast float [[REASS_MUL160]], [[TMP311]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast float [[TMP4]], [[ARG]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP5]], [[ARG]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast float [[TMP6]], [[ARG]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP7]], [[ARG]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast float [[TMP8]], [[ARG]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], [[ARG]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast float [[TMP10]], [[ARG]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], [[ARG]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd fast float [[TMP12]], [[ARG]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast float [[TMP13]], [[ARG]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd fast float [[TMP14]], [[ARG]]
+; CHECK-NEXT:    [[TMP16:%.*]] = fmul fast float [[TMP15]], [[ARG]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd fast float [[TMP16]], [[ARG]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast float [[TMP17]], [[ARG]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast float [[TMP18]], [[ARG]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul fast float [[TMP19]], [[ARG]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd fast float [[TMP20]], [[ARG]]
+; CHECK-NEXT:    [[TMP22:%.*]] = fmul fast float [[TMP21]], [[ARG]]
+; CHECK-NEXT:    [[TMP23:%.*]] = fadd fast float [[TMP22]], [[ARG]]
+; CHECK-NEXT:    [[REASS_MUL166:%.*]] = fmul fast float [[ARG]], [[ARG]]
+; CHECK-NEXT:    [[TMP24:%.*]] = fmul fast float [[REASS_MUL166]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast float [[TMP24]], [[ARG]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast float [[TMP25]], [[ARG]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast float [[TMP26]], [[ARG]]
+; CHECK-NEXT:    [[TMP29:%.*]] = fmul fast float [[ARG]], [[ARG]]
+; CHECK-NEXT:    [[TMP31:%.*]] = fmul fast float [[TMP29]], 0x3FEA2E8B80000000
+; CHECK-NEXT:    [[TMP33:%.*]] = fmul fast float [[TMP31]], [[TMP27]]
+; CHECK-NEXT:    [[TMP34:%.*]] = fadd fast float [[TMP33]], [[ARG]]
+; CHECK-NEXT:    ret float [[TMP34]]
+;
+entry:
+  %tmp = fmul fast float %arg, 0xBFE99999A0000000
+  %tmp1 = fadd fast float %tmp, 1.000000e+00
+  %tmp2 = fmul fast float %arg, 0xBFE99999A0000000
+  %tmp3 = fadd fast float %tmp2, 1.000000e+00
+  %reass.mul156 = fmul fast float %arg, %tmp1
+  %reass.mul160 = fmul fast float %arg, %tmp1
+  %tmp4 = fmul fast float %reass.mul160, %tmp3
+  %tmp5 = fadd fast float %arg, %tmp4
+  %tmp6 = fmul fast float %tmp5, %arg
+  %tmp7 = fadd fast float %tmp6, %arg
+  %tmp8 = fmul fast float %tmp7, %arg
+  %tmp9 = fadd fast float %arg, %tmp8
+  %tmp10 = fmul fast float %tmp9, %arg
+  %tmp11 = fadd fast float %tmp10, %arg
+  %tmp12 = fmul fast float %tmp11, %arg
+  %tmp13 = fadd fast float %tmp12, %arg
+  %tmp14 = fmul fast float %tmp13, %arg
+  %tmp15 = fadd fast float %arg, %tmp14
+  %tmp16 = fmul fast float %tmp15, %arg
+  %tmp17 = fadd fast float %tmp16, %arg
+  %tmp18 = fmul fast float %tmp17, %arg
+  %tmp19 = fadd fast float %tmp18, %arg
+  %tmp20 = fmul fast float %tmp19, %arg
+  %tmp21 = fadd fast float %tmp20, %arg
+  %tmp22 = fmul fast float %tmp21, %arg
+  %tmp23 = fadd fast float %tmp22, %arg
+  %reass.mul166 = fmul fast float %arg, %tmp23
+  %tmp24 = fmul fast float %reass.mul166, %arg
+  %tmp25 = fadd fast float %arg, %tmp24
+  %tmp26 = fmul fast float %arg, %tmp25
+  %tmp27 = fadd fast float %tmp26, %arg
+  %tmp28 = fmul fast float %arg, %tmp27
+  %tmp29 = fmul fast float %tmp28, %arg
+  %tmp31 = fmul fast float %tmp29, 0x3FED1745C0000000
+  %tmp33 = fmul fast float %tmp31, 0x3FECCCCCC0000000
+  %tmp34 = fadd fast float %arg, %tmp33
+  ret float %tmp34
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/pr12245.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/pr12245.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/pr12245.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/pr12245.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt < %s -basicaa -inline -instcombine -reassociate -dse -disable-output
+; PR12245
+
+ at a = common global i32 0, align 4
+ at d = common global i32 0, align 4
+
+define i32 @fn2() nounwind uwtable ssp {
+entry:
+  %0 = load i32, i32* @a, align 4
+  %dec = add nsw i32 %0, -1
+  store i32 %dec, i32* @a, align 4
+  %1 = load i32, i32* @d, align 4
+  %sub = sub nsw i32 %dec, %1
+  store i32 %sub, i32* @d, align 4
+  %2 = load i32, i32* @a, align 4
+  %dec1 = add nsw i32 %2, -1
+  store i32 %dec1, i32* @a, align 4
+  %3 = load i32, i32* @d, align 4
+  %sub2 = sub nsw i32 %dec1, %3
+  store i32 %sub2, i32* @d, align 4
+  %4 = load i32, i32* @a, align 4
+  %dec3 = add nsw i32 %4, -1
+  store i32 %dec3, i32* @a, align 4
+  %5 = load i32, i32* @d, align 4
+  %sub4 = sub nsw i32 %dec3, %5
+  store i32 %sub4, i32* @d, align 4
+  %6 = load i32, i32* @a, align 4
+  %dec5 = add nsw i32 %6, -1
+  store i32 %dec5, i32* @a, align 4
+  %7 = load i32, i32* @d, align 4
+  %sub6 = sub nsw i32 %dec5, %7
+  store i32 %sub6, i32* @d, align 4
+  %8 = load i32, i32* @a, align 4
+  %dec7 = add nsw i32 %8, -1
+  store i32 %dec7, i32* @a, align 4
+  %9 = load i32, i32* @d, align 4
+  %sub8 = sub nsw i32 %dec7, %9
+  store i32 %sub8, i32* @d, align 4
+  ret i32 0
+}
+
+define i32 @fn1() nounwind uwtable ssp {
+entry:
+  %call = call i32 @fn2()
+  ret i32 %call
+}

Added: llvm/trunk/test/Transforms/Reassociate/pr21205.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/pr21205.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/pr21205.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/pr21205.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; RUN: opt -reassociate -S < %s | FileCheck %s
+; PR21205
+
+ at a = common global i32 0, align 4
+ at b = common global i32 0, align 4
+
+; Don't canonicalize %conv - undef into %conv + (-undef).
+; CHECK-LABEL: @test1
+; CHECK: %sub = fsub fast float %conv, undef
+; CHECK: %sub1 = fadd fast float %sub, -1.000000e+00
+
+define i32 @test1() {
+entry:
+  %0 = load i32, i32* @a, align 4
+  %conv = sitofp i32 %0 to float
+  %sub = fsub fast float %conv, undef
+  %sub1 = fadd fast float %sub, -1.000000e+00
+  %conv2 = fptosi float %sub1 to i32
+  store i32 %conv2, i32* @b, align 4
+  ret i32 undef
+}

Added: llvm/trunk/test/Transforms/Reassociate/pr28367.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/pr28367.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/pr28367.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/pr28367.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt < %s -reassociate -S
+
+; PR28367
+
+; Check to make sure this test does not assert or segfault.  If we get too
+; aggressive with retrying instructions it's possible to invalidate our
+; iterator.  See PR28367 for complete details.
+
+define void @fn1(i32 %a, i1 %c, i32* %ptr)  {
+entry:
+  br label %for.cond
+
+for.cond:
+  %d.0 = phi i32 [ 1, %entry ], [ 2, %for.body ]
+  br i1 %c, label %for.end, label %for.body
+
+for.body:
+  %sub1 = sub i32 %a, %d.0
+  %dead1 = add i32 %sub1, 1
+  %dead2 = mul i32 %dead1, 3
+  %dead3 = mul i32 %dead2, %sub1
+  %sub2 = sub nsw i32 0, %d.0
+  store i32 %sub2, i32* %ptr, align 4
+  br label %for.cond
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/propagate-flags.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/propagate-flags.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/propagate-flags.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/propagate-flags.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+define double @func(double %a, double %b) {
+; CHECK-LABEL: @func(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast double %b, %a
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast double [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %mul1 = fmul fast double %a, %a
+  %mul2 = fmul fast double %b, %b
+  %mul3 = fmul fast double %mul1, %mul2
+  ret double %mul3
+}

Added: llvm/trunk/test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; Input is A op (B op C)
+
+define half @faddsubAssoc1(half %a, half %b) {
+; CHECK-LABEL: @faddsubAssoc1(
+; CHECK-NEXT:    [[T2_NEG:%.*]] = fmul fast half [[A:%.*]], 0xH4500
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast half [[B:%.*]], 0xH4500
+; CHECK-NEXT:    [[T51:%.*]] = fsub fast half [[REASS_MUL]], [[T2_NEG]]
+; CHECK-NEXT:    [[T5:%.*]] = fadd fast half [[REASS_MUL]], [[T2_NEG]]
+; CHECK-NEXT:    ret half [[T51]]
+;
+  %t1 = fmul fast half %b, 0xH4200 ; 3*b
+  %t2 = fmul fast half %a, 0xH4500 ; 5*a
+  %t3 = fmul fast half %b, 0xH4000 ; 2*b
+  %t4 = fsub fast half %t2, %t1 ; 5 * a - 3 * b
+  %t5 = fsub fast half %t3, %t4 ; 2 * b - ( 5 * a - 3 * b)
+  ret half %t5 ; = 5 * (b - a)
+}
+
+; Input is (A op B) op C
+
+define half @faddsubAssoc2(half %a, half %b) {
+; CHECK-LABEL: @faddsubAssoc2(
+; CHECK-NEXT:    [[T2:%.*]] = fmul fast half [[A:%.*]], 0xH4500
+; CHECK-NEXT:    [[T5:%.*]] = fadd fast half [[B:%.*]], [[T2]]
+; CHECK-NEXT:    ret half [[T5]]
+;
+  %t1 = fmul fast half %b, 0xH4200 ; 3*b
+  %t2 = fmul fast half %a, 0xH4500 ; 5*a
+  %t3 = fmul fast half %b, 0xH4000 ; 2*b
+  %t4 = fadd fast half %t2, %t1 ; 5 * a + 3 * b
+  %t5 = fsub fast half %t4, %t3 ; (5 * a + 3 * b) - (2 * b)
+  ret half %t5 ; = 5 * a + b
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/reassociate-deadinst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/reassociate-deadinst.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/reassociate-deadinst.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/reassociate-deadinst.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -inline -functionattrs -reassociate -S | FileCheck %s
+
+; CHECK-NOT: func1
+; CHECK-LABEL: main
+; CHECK-NEXT: ret void
+
+define internal i16 @func1() noinline #0 {
+  ret i16 0
+}
+
+define void @main(i16 %argc, i16** %argv) #0 {
+  %_tmp0 = call i16 @func1()
+  %_tmp2 = zext i16 %_tmp0 to i32
+  ret void
+}
+attributes #0 = { minsize nounwind optsize }

Added: llvm/trunk/test/Transforms/Reassociate/reassociate_dbgvalue_discard.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/reassociate_dbgvalue_discard.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/reassociate_dbgvalue_discard.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/reassociate_dbgvalue_discard.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -reassociate -S -o - | FileCheck %s
+
+; After reassociation m1 and m2 aren't calculated as m1=c*a and m2=c*b any longer.
+; So let's verify that the dbg.value nodes for m1 and m2 are invalidated.
+
+source_filename = "reassociate_dbgvalue_discard.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local i32 @test1(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 !dbg !7 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !16, metadata !DIExpression()), !dbg !20
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !17, metadata !DIExpression()), !dbg !21
+; CHECK-NEXT:    [[M1:%.*]] = mul i32 [[D:%.*]], [[C:%.*]], !dbg !22
+; CHECK-NEXT:    [[M3:%.*]] = mul i32 [[M1]], [[A:%.*]], !dbg !23
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[M3]], metadata !18, metadata !DIExpression()), !dbg !24
+; CHECK-NEXT:    [[M2:%.*]] = mul i32 [[D]], [[C]], !dbg !25
+; CHECK-NEXT:    [[M4:%.*]] = mul i32 [[M2]], [[B:%.*]], !dbg !26
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[M4]], metadata !19, metadata !DIExpression()), !dbg !27
+; CHECK-NEXT:    [[RES:%.*]] = xor i32 [[M3]], [[M4]]
+; CHECK-NEXT:    ret i32 [[RES]], !dbg !28
+;
+entry:
+  %m1 = mul i32 %c, %a, !dbg !24
+  call void @llvm.dbg.value(metadata i32 %m1, metadata !16, metadata !DIExpression()), !dbg !25
+  %m2 = mul i32 %c, %b, !dbg !26
+  call void @llvm.dbg.value(metadata i32 %m2, metadata !17, metadata !DIExpression()), !dbg !27
+  %m3 = mul i32 %m1, %d, !dbg !28
+  call void @llvm.dbg.value(metadata i32 %m3, metadata !18, metadata !DIExpression()), !dbg !29
+  %m4 = mul i32 %m2, %d, !dbg !30
+  call void @llvm.dbg.value(metadata i32 %m4, metadata !19, metadata !DIExpression()), !dbg !31
+  %res = xor i32 %m3, %m4
+  ret i32 %res, !dbg !32
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind readnone uwtable }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 7.0.0 (trunk 330596) (llvm/trunk 330594)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "reassociate_dbgvalue_discard.c", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 7.0.0 (trunk 330596) (llvm/trunk 330594)"}
+!7 = distinct !DISubprogram(name: "test1", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !10, !10, !10, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!12, !13, !14, !15, !16, !17, !18, !19}
+!12 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 3, type: !10)
+!13 = !DILocalVariable(name: "b", arg: 2, scope: !7, file: !1, line: 3, type: !10)
+!14 = !DILocalVariable(name: "c", arg: 3, scope: !7, file: !1, line: 3, type: !10)
+!15 = !DILocalVariable(name: "d", arg: 4, scope: !7, file: !1, line: 3, type: !10)
+!16 = !DILocalVariable(name: "t1", scope: !7, file: !1, line: 4, type: !10)
+!17 = !DILocalVariable(name: "t2", scope: !7, file: !1, line: 5, type: !10)
+!18 = !DILocalVariable(name: "t3", scope: !7, file: !1, line: 6, type: !10)
+!19 = !DILocalVariable(name: "t4", scope: !7, file: !1, line: 7, type: !10)
+!20 = !DILocation(line: 3, column: 15, scope: !7)
+!21 = !DILocation(line: 3, column: 22, scope: !7)
+!22 = !DILocation(line: 3, column: 29, scope: !7)
+!23 = !DILocation(line: 3, column: 36, scope: !7)
+!24 = !DILocation(line: 4, column: 14, scope: !7)
+!25 = !DILocation(line: 4, column: 7, scope: !7)
+!26 = !DILocation(line: 5, column: 14, scope: !7)
+!27 = !DILocation(line: 5, column: 7, scope: !7)
+!28 = !DILocation(line: 6, column: 15, scope: !7)
+!29 = !DILocation(line: 6, column: 7, scope: !7)
+!30 = !DILocation(line: 7, column: 15, scope: !7)
+!31 = !DILocation(line: 7, column: 7, scope: !7)
+!32 = !DILocation(line: 8, column: 3, scope: !7)

Added: llvm/trunk/test/Transforms/Reassociate/repeats.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/repeats.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/repeats.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/repeats.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,252 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; Tests involving repeated operations on the same value.
+
+define i8 @nilpotent(i8 %x) {
+; CHECK-LABEL: @nilpotent(
+  %tmp = xor i8 %x, %x
+  ret i8 %tmp
+; CHECK: ret i8 0
+}
+
+define i2 @idempotent(i2 %x) {
+; CHECK-LABEL: @idempotent(
+  %tmp1 = and i2 %x, %x
+  %tmp2 = and i2 %tmp1, %x
+  %tmp3 = and i2 %tmp2, %x
+  ret i2 %tmp3
+; CHECK: ret i2 %x
+}
+
+define i2 @add(i2 %x) {
+; CHECK-LABEL: @add(
+  %tmp1 = add i2 %x, %x
+  %tmp2 = add i2 %tmp1, %x
+  %tmp3 = add i2 %tmp2, %x
+  ret i2 %tmp3
+; CHECK: ret i2 0
+}
+
+define i2 @cst_add() {
+; CHECK-LABEL: @cst_add(
+  %tmp1 = add i2 1, 1
+  %tmp2 = add i2 %tmp1, 1
+  ret i2 %tmp2
+; CHECK: ret i2 -1
+}
+
+define i8 @cst_mul() {
+; CHECK-LABEL: @cst_mul(
+  %tmp1 = mul i8 3, 3
+  %tmp2 = mul i8 %tmp1, 3
+  %tmp3 = mul i8 %tmp2, 3
+  %tmp4 = mul i8 %tmp3, 3
+  ret i8 %tmp4
+; CHECK: ret i8 -13
+}
+
+define i3 @foo3x5(i3 %x) {
+; Can be done with two multiplies.
+; CHECK-LABEL: @foo3x5(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i3 %x, %x
+  %tmp2 = mul i3 %tmp1, %x
+  %tmp3 = mul i3 %tmp2, %x
+  %tmp4 = mul i3 %tmp3, %x
+  ret i3 %tmp4
+}
+
+define i3 @foo3x6(i3 %x) {
+; Can be done with two multiplies.
+; CHECK-LABEL: @foo3x6(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i3 %x, %x
+  %tmp2 = mul i3 %tmp1, %x
+  %tmp3 = mul i3 %tmp2, %x
+  %tmp4 = mul i3 %tmp3, %x
+  %tmp5 = mul i3 %tmp4, %x
+  ret i3 %tmp5
+}
+
+define i3 @foo3x7(i3 %x) {
+; Can be done with two multiplies.
+; CHECK-LABEL: @foo3x7(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i3 %x, %x
+  %tmp2 = mul i3 %tmp1, %x
+  %tmp3 = mul i3 %tmp2, %x
+  %tmp4 = mul i3 %tmp3, %x
+  %tmp5 = mul i3 %tmp4, %x
+  %tmp6 = mul i3 %tmp5, %x
+  ret i3 %tmp6
+}
+
+define i4 @foo4x8(i4 %x) {
+; Can be done with two multiplies.
+; CHECK-LABEL: @foo4x8(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i4 %x, %x
+  %tmp2 = mul i4 %tmp1, %x
+  %tmp3 = mul i4 %tmp2, %x
+  %tmp4 = mul i4 %tmp3, %x
+  %tmp5 = mul i4 %tmp4, %x
+  %tmp6 = mul i4 %tmp5, %x
+  %tmp7 = mul i4 %tmp6, %x
+  ret i4 %tmp7
+}
+
+define i4 @foo4x9(i4 %x) {
+; Can be done with three multiplies.
+; CHECK-LABEL: @foo4x9(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i4 %x, %x
+  %tmp2 = mul i4 %tmp1, %x
+  %tmp3 = mul i4 %tmp2, %x
+  %tmp4 = mul i4 %tmp3, %x
+  %tmp5 = mul i4 %tmp4, %x
+  %tmp6 = mul i4 %tmp5, %x
+  %tmp7 = mul i4 %tmp6, %x
+  %tmp8 = mul i4 %tmp7, %x
+  ret i4 %tmp8
+}
+
+define i4 @foo4x10(i4 %x) {
+; Can be done with three multiplies.
+; CHECK-LABEL: @foo4x10(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i4 %x, %x
+  %tmp2 = mul i4 %tmp1, %x
+  %tmp3 = mul i4 %tmp2, %x
+  %tmp4 = mul i4 %tmp3, %x
+  %tmp5 = mul i4 %tmp4, %x
+  %tmp6 = mul i4 %tmp5, %x
+  %tmp7 = mul i4 %tmp6, %x
+  %tmp8 = mul i4 %tmp7, %x
+  %tmp9 = mul i4 %tmp8, %x
+  ret i4 %tmp9
+}
+
+define i4 @foo4x11(i4 %x) {
+; Can be done with four multiplies.
+; CHECK-LABEL: @foo4x11(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i4 %x, %x
+  %tmp2 = mul i4 %tmp1, %x
+  %tmp3 = mul i4 %tmp2, %x
+  %tmp4 = mul i4 %tmp3, %x
+  %tmp5 = mul i4 %tmp4, %x
+  %tmp6 = mul i4 %tmp5, %x
+  %tmp7 = mul i4 %tmp6, %x
+  %tmp8 = mul i4 %tmp7, %x
+  %tmp9 = mul i4 %tmp8, %x
+  %tmp10 = mul i4 %tmp9, %x
+  ret i4 %tmp10
+}
+
+define i4 @foo4x12(i4 %x) {
+; Can be done with two multiplies.
+; CHECK-LABEL: @foo4x12(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i4 %x, %x
+  %tmp2 = mul i4 %tmp1, %x
+  %tmp3 = mul i4 %tmp2, %x
+  %tmp4 = mul i4 %tmp3, %x
+  %tmp5 = mul i4 %tmp4, %x
+  %tmp6 = mul i4 %tmp5, %x
+  %tmp7 = mul i4 %tmp6, %x
+  %tmp8 = mul i4 %tmp7, %x
+  %tmp9 = mul i4 %tmp8, %x
+  %tmp10 = mul i4 %tmp9, %x
+  %tmp11 = mul i4 %tmp10, %x
+  ret i4 %tmp11
+}
+
+define i4 @foo4x13(i4 %x) {
+; Can be done with three multiplies.
+; CHECK-LABEL: @foo4x13(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i4 %x, %x
+  %tmp2 = mul i4 %tmp1, %x
+  %tmp3 = mul i4 %tmp2, %x
+  %tmp4 = mul i4 %tmp3, %x
+  %tmp5 = mul i4 %tmp4, %x
+  %tmp6 = mul i4 %tmp5, %x
+  %tmp7 = mul i4 %tmp6, %x
+  %tmp8 = mul i4 %tmp7, %x
+  %tmp9 = mul i4 %tmp8, %x
+  %tmp10 = mul i4 %tmp9, %x
+  %tmp11 = mul i4 %tmp10, %x
+  %tmp12 = mul i4 %tmp11, %x
+  ret i4 %tmp12
+}
+
+define i4 @foo4x14(i4 %x) {
+; Can be done with three multiplies.
+; CHECK-LABEL: @foo4x14(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i4 %x, %x
+  %tmp2 = mul i4 %tmp1, %x
+  %tmp3 = mul i4 %tmp2, %x
+  %tmp4 = mul i4 %tmp3, %x
+  %tmp5 = mul i4 %tmp4, %x
+  %tmp6 = mul i4 %tmp5, %x
+  %tmp7 = mul i4 %tmp6, %x
+  %tmp8 = mul i4 %tmp7, %x
+  %tmp9 = mul i4 %tmp8, %x
+  %tmp10 = mul i4 %tmp9, %x
+  %tmp11 = mul i4 %tmp10, %x
+  %tmp12 = mul i4 %tmp11, %x
+  %tmp13 = mul i4 %tmp12, %x
+  ret i4 %tmp13
+}
+
+define i4 @foo4x15(i4 %x) {
+; Can be done with four multiplies.
+; CHECK-LABEL: @foo4x15(
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: mul
+; CHECK-NEXT: ret
+  %tmp1 = mul i4 %x, %x
+  %tmp2 = mul i4 %tmp1, %x
+  %tmp3 = mul i4 %tmp2, %x
+  %tmp4 = mul i4 %tmp3, %x
+  %tmp5 = mul i4 %tmp4, %x
+  %tmp6 = mul i4 %tmp5, %x
+  %tmp7 = mul i4 %tmp6, %x
+  %tmp8 = mul i4 %tmp7, %x
+  %tmp9 = mul i4 %tmp8, %x
+  %tmp10 = mul i4 %tmp9, %x
+  %tmp11 = mul i4 %tmp10, %x
+  %tmp12 = mul i4 %tmp11, %x
+  %tmp13 = mul i4 %tmp12, %x
+  %tmp14 = mul i4 %tmp13, %x
+  ret i4 %tmp14
+}

Added: llvm/trunk/test/Transforms/Reassociate/secondary.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/secondary.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/secondary.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/secondary.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt -S -reassociate < %s | FileCheck %s
+; rdar://9167457
+
+; Reassociate shouldn't break this testcase involving a secondary
+; reassociation.
+
+; CHECK:     define
+; CHECK-NOT: undef
+; CHECK:     %factor = mul i32 %tmp3, -2
+; CHECK-NOT: undef
+; CHECK:     }
+
+define void @x0f2f640ab6718391b59ce96d9fdeda54(i32 %arg, i32 %arg1, i32 %arg2, i32* %.out) nounwind {
+_:
+  %tmp = sub i32 %arg, %arg1
+  %tmp3 = mul i32 %tmp, -1268345047
+  %tmp4 = add i32 %tmp3, 2014710503
+  %tmp5 = add i32 %tmp3, -1048397418
+  %tmp6 = sub i32 %tmp4, %tmp5
+  %tmp7 = sub i32 -2014710503, %tmp3
+  %tmp8 = add i32 %tmp6, %tmp7
+  store i32 %tmp8, i32* %.out
+  ret void
+}

Added: llvm/trunk/test/Transforms/Reassociate/shift-factor.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/shift-factor.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/shift-factor.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/shift-factor.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+; There should be exactly one shift and one add left.
+
+define i32 @test1(i32 %X, i32 %Y) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i32 %Y, %X
+; CHECK-NEXT:    [[REASS_MUL:%.*]] = shl i32 [[REASS_ADD]], 1
+; CHECK-NEXT:    ret i32 [[REASS_MUL]]
+;
+  %t2 = shl i32 %X, 1
+  %t6 = shl i32 %Y, 1
+  %t4 = add i32 %t6, %t2
+  ret i32 %t4
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/shifttest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/shifttest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/shifttest.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/shifttest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; With shl->mul reassociation, we can see that this is (shl A, 9) * A
+;
+; RUN: opt < %s -reassociate -instcombine -S |\
+; RUN:    grep "shl .*, 9"
+
+define i32 @test(i32 %A, i32 %B) {
+	%X = shl i32 %A, 5		; <i32> [#uses=1]
+	%Y = shl i32 %A, 4		; <i32> [#uses=1]
+	%Z = mul i32 %Y, %X		; <i32> [#uses=1]
+	ret i32 %Z
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/subtest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/subtest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/subtest.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/subtest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+; With sub reassociation, constant folding can eliminate the 12 and -12 constants.
+define i32 @test1(i32 %A, i32 %B) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[Z:%.*]] = sub i32 %A, %B
+; CHECK-NEXT:    ret i32 [[Z]]
+;
+  %X = add i32 -12, %A
+  %Y = sub i32 %X, %B
+  %Z = add i32 %Y, 12
+  ret i32 %Z
+}
+
+; PR2047
+; With sub reassociation, constant folding can eliminate the uses of %a.
+define i32 @test2(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[SUM:%.*]] = add i32 %c, %b
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[SUM]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+  %tmp3 = sub i32 %a, %b
+  %tmp5 = sub i32 %tmp3, %c
+  %tmp7 = sub i32 %tmp5, %a
+  ret i32 %tmp7
+}
+

Added: llvm/trunk/test/Transforms/Reassociate/vaarg_movable.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/vaarg_movable.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/vaarg_movable.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/vaarg_movable.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt -S -reassociate -die < %s | FileCheck %s
+
+; The two va_arg instructions depend on the memory/context, are therefore not
+; identical and the sub should not be optimized to 0 by reassociate.
+;
+; CHECK-LABEL: @func(
+; ...
+; CHECK: %v0 = va_arg i8** %varargs, i32
+; CHECK: %v1 = va_arg i8** %varargs, i32
+; CHECK: %v0.neg = sub i32 0, %v0
+; CHECK: %sub = add i32 %v0.neg, 1
+; CHECK: %add = add i32 %sub, %v1
+; ...
+; CHECK: ret i32 %add
+define i32 @func(i32 %dummy, ...) {
+  %varargs = alloca i8*, align 8
+  %varargs1 = bitcast i8** %varargs to i8*
+  call void @llvm.va_start(i8* %varargs1)
+  %v0 = va_arg i8** %varargs, i32
+  %v1 = va_arg i8** %varargs, i32
+  %sub = sub nsw i32 %v1, %v0
+  %add = add nsw i32 %sub, 1
+  call void @llvm.va_end(i8* %varargs1)
+  ret i32 %add
+}
+
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)

Added: llvm/trunk/test/Transforms/Reassociate/wrap-flags.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/wrap-flags.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/wrap-flags.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/wrap-flags.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; RUN: opt < %s -reassociate -dce -S | FileCheck %s
+; PR12985
+
+; Verify the nsw flags are preserved when converting shl to mul.
+
+; CHECK-LABEL: @shl_to_mul_nsw(
+; CHECK: %mul = mul i32 %i, -2147483648
+; CHECK: add i32 %mul, 1
+define i32 @shl_to_mul_nsw(i32 %i) {
+entry:
+  %mul = shl nsw i32 %i, 31
+  %mul2 = add i32 %mul, 1
+  ret i32 %mul2
+}
+
+; CHECK-LABEL: @shl_to_mul_nuw(
+; CHECK: %mul = mul nuw i32 %i, 4
+; CHECK: add i32 %mul, 1
+define i32 @shl_to_mul_nuw(i32 %i) {
+entry:
+  %mul = shl nuw i32 %i, 2
+  %mul2 = add i32 %mul, 1
+  ret i32 %mul2
+}
+
+; CHECK-LABEL: @shl_to_mul_nuw_nsw(
+; CHECK: %mul = mul nuw nsw i32 %i, 4
+; CHECK: add i32 %mul, 1
+define i32 @shl_to_mul_nuw_nsw(i32 %i) {
+entry:
+  %mul = shl nuw nsw i32 %i, 2
+  %mul2 = add i32 %mul, 1
+  ret i32 %mul2
+}
+
+; CHECK-LABEL: @pr23926(
+; CHECK:       %[[X1_neg:.*]] = sub i2 0, %X1
+; CHECK-NEXT:  %[[sub_one:.*]] = add i2 %[[X1_neg]], -1
+; CHECK-NEXT:  %[[add:.*]] = add i2 %[[sub_one]], %X2
+; CHECK-NEXT:  ret i2 %[[add]]
+define i2 @pr23926(i2 %X1, i2 %X2) {
+  %add = add nuw i2 %X1, 1
+  %sub = sub nuw nsw i2 %X2, %add
+  ret i2 %sub
+}

Added: llvm/trunk/test/Transforms/Reassociate/xor_reassoc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reassociate/xor_reassoc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reassociate/xor_reassoc.ll (added)
+++ llvm/trunk/test/Transforms/Reassociate/xor_reassoc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,294 @@
+;RUN: opt -S -reassociate < %s | FileCheck %s
+
+; ==========================================================================
+;
+;   Xor reassociation general cases
+;  
+; ==========================================================================
+
+; (x | c1) ^ (x | c2) => (x & c3) ^ c3, where c3 = c1^c2
+;   
+define i32 @xor1(i32 %x) {
+  %or = or i32 %x, 123
+  %or1 = or i32 %x, 456
+  %xor = xor i32 %or, %or1
+  ret i32 %xor
+
+;CHECK-LABEL: @xor1(
+;CHECK: %and.ra = and i32 %x, 435
+;CHECK: %xor = xor i32 %and.ra, 435
+}
+
+; (x | c1) ^ (x | c2) => (x & c3) ^ c3, where c3 = c1^c2
+;   
+define <2 x i32> @xor1_vec(<2 x i32> %x) {
+  %or = or <2 x i32> %x, <i32 123, i32 123>
+  %or1 = or <2 x i32> %x, <i32 456, i32 456>
+  %xor = xor <2 x i32> %or, %or1
+  ret <2 x i32> %xor
+
+;CHECK-LABEL: @xor1_vec(
+;CHECK: %and.ra = and <2 x i32> %x, <i32 435, i32 435>
+;CHECK: %xor = xor <2 x i32> %and.ra, <i32 435, i32 435>
+}
+
+; Test rule : (x & c1) ^ (x & c2) = (x & (c1^c2))
+; Real testing case : (x & 123) ^ y ^ (x & 456) => (x & 435) ^ y
+define i32 @xor2(i32 %x, i32 %y) {
+  %and = and i32 %x, 123
+  %xor = xor i32 %and, %y
+  %and1 = and i32 %x, 456
+  %xor2 = xor i32 %xor, %and1
+  ret i32 %xor2
+
+;CHECK-LABEL: @xor2(
+;CHECK: %and.ra = and i32 %x, 435
+;CHECK: %xor2 = xor i32 %and.ra, %y
+}
+
+; Test rule : (x & c1) ^ (x & c2) = (x & (c1^c2))
+; Real testing case : (x & 123) ^ y ^ (x & 456) => (x & 435) ^ y
+define <2 x i32> @xor2_vec(<2 x i32> %x, <2 x i32> %y) {
+  %and = and <2 x i32> %x, <i32 123, i32 123>
+  %xor = xor <2 x i32> %and, %y
+  %and1 = and <2 x i32> %x, <i32 456, i32 456>
+  %xor2 = xor <2 x i32> %xor, %and1
+  ret <2 x i32> %xor2
+
+;CHECK-LABEL: @xor2_vec(
+;CHECK: %and.ra = and <2 x i32> %x, <i32 435, i32 435>
+;CHECK: %xor2 = xor <2 x i32> %and.ra, %y
+}
+
+; Test rule: (x | c1) ^ (x & c2) = (x & c3) ^ c1, where c3 = ~c1 ^ c2
+;  c3 = ~c1 ^ c2
+define i32 @xor3(i32 %x, i32 %y) {
+  %or = or i32 %x, 123
+  %xor = xor i32 %or, %y
+  %and = and i32 %x, 456
+  %xor1 = xor i32 %xor, %and
+  ret i32 %xor1
+
+;CHECK-LABEL: @xor3(
+;CHECK: %and.ra = and i32 %x, -436
+;CHECK: %xor = xor i32 %y, 123
+;CHECK: %xor1 = xor i32 %xor, %and.ra
+}
+
+; Test rule: (x | c1) ^ (x & c2) = (x & c3) ^ c1, where c3 = ~c1 ^ c2
+;  c3 = ~c1 ^ c2
+define <2 x i32> @xor3_vec(<2 x i32> %x, <2 x i32> %y) {
+  %or = or <2 x i32> %x, <i32 123, i32 123>
+  %xor = xor <2 x i32> %or, %y
+  %and = and <2 x i32> %x, <i32 456, i32 456>
+  %xor1 = xor <2 x i32> %xor, %and
+  ret <2 x i32> %xor1
+
+;CHECK-LABEL: @xor3_vec(
+;CHECK: %and.ra = and <2 x i32> %x, <i32 -436, i32 -436>
+;CHECK: %xor = xor <2 x i32> %y, <i32 123, i32 123>
+;CHECK: %xor1 = xor <2 x i32> %xor, %and.ra
+}
+
+; Test rule: (x | c1) ^ c2 = (x & ~c1) ^ (c1 ^ c2)
+define i32 @xor4(i32 %x, i32 %y) {
+  %and = and i32 %x, -124
+  %xor = xor i32 %y, 435
+  %xor1 = xor i32 %xor, %and
+  ret i32 %xor1
+; CHECK-LABEL: @xor4(
+; CHECK: %and = and i32 %x, -124
+; CHECK: %xor = xor i32 %y, 435
+; CHECK: %xor1 = xor i32 %xor, %and
+}
+
+; Test rule: (x | c1) ^ c2 = (x & ~c1) ^ (c1 ^ c2)
+define <2 x i32> @xor4_vec(<2 x i32> %x, <2 x i32> %y) {
+  %and = and <2 x i32> %x, <i32 -124, i32 -124>
+  %xor = xor <2 x i32> %y, <i32 435, i32 435>
+  %xor1 = xor <2 x i32> %xor, %and
+  ret <2 x i32> %xor1
+; CHECK-LABEL: @xor4_vec(
+; CHECK: %and = and <2 x i32> %x, <i32 -124, i32 -124>
+; CHECK: %xor = xor <2 x i32> %y, <i32 435, i32 435>
+; CHECK: %xor1 = xor <2 x i32> %xor, %and
+}
+
+; ==========================================================================
+;
+;  Xor reassociation special cases
+;  
+; ==========================================================================
+
+; Special case1: 
+;  (x | c1) ^ (x & ~c1) = c1
+define i32 @xor_special1(i32 %x, i32 %y) {
+  %or = or i32 %x, 123
+  %xor = xor i32 %or, %y
+  %and = and i32 %x, -124
+  %xor1 = xor i32 %xor, %and
+  ret i32 %xor1
+; CHECK-LABEL: @xor_special1(
+; CHECK: %xor1 = xor i32 %y, 123
+; CHECK: ret i32 %xor1
+}
+
+; Special case1: 
+;  (x | c1) ^ (x & ~c1) = c1
+define <2 x i32> @xor_special1_vec(<2 x i32> %x, <2 x i32> %y) {
+  %or = or <2 x i32> %x, <i32 123, i32 123>
+  %xor = xor <2 x i32> %or, %y
+  %and = and <2 x i32> %x, <i32 -124, i32 -124>
+  %xor1 = xor <2 x i32> %xor, %and
+  ret <2 x i32> %xor1
+; CHECK-LABEL: @xor_special1_vec(
+; CHECK: %xor1 = xor <2 x i32> %y, <i32 123, i32 123>
+; CHECK: ret <2 x i32> %xor1
+}
+
+; Special case2: 
+;  (x | c1) ^ (x & c1) = x ^ c1
+define i32 @xor_special2(i32 %x, i32 %y) {
+  %or = or i32 %x, 123
+  %xor = xor i32 %or, %y
+  %and = and i32 %x, 123
+  %xor1 = xor i32 %xor, %and
+  ret i32 %xor1
+; CHECK-LABEL: @xor_special2(
+; CHECK: %xor = xor i32 %x, 123
+; CHECK: %xor1 = xor i32 %xor, %y
+; CHECK: ret i32 %xor1
+}
+
+; Special case2: 
+;  (x | c1) ^ (x & c1) = x ^ c1
+define <2 x i32> @xor_special2_vec(<2 x i32> %x, <2 x i32> %y) {
+  %or = or <2 x i32> %x, <i32 123, i32 123>
+  %xor = xor <2 x i32> %or, %y
+  %and = and <2 x i32> %x, <i32 123, i32 123>
+  %xor1 = xor <2 x i32> %xor, %and
+  ret <2 x i32> %xor1
+; CHECK-LABEL: @xor_special2_vec(
+; CHECK: %xor = xor <2 x i32> %x, <i32 123, i32 123>
+; CHECK: %xor1 = xor <2 x i32> %xor, %y
+; CHECK: ret <2 x i32> %xor1
+}
+
+; (x | c1) ^ (x | c1) => 0
+define i32 @xor_special3(i32 %x) {
+  %or = or i32 %x, 123
+  %or1 = or i32 %x, 123
+  %xor = xor i32 %or, %or1
+  ret i32 %xor
+;CHECK-LABEL: @xor_special3(
+;CHECK: ret i32 0
+}
+
+; (x | c1) ^ (x | c1) => 0
+define <2 x i32> @xor_special3_vec(<2 x i32> %x) {
+  %or = or <2 x i32> %x, <i32 123, i32 123>
+  %or1 = or <2 x i32> %x, <i32 123, i32 123>
+  %xor = xor <2 x i32> %or, %or1
+  ret <2 x i32> %xor
+;CHECK-LABEL: @xor_special3_vec(
+;CHECK: ret <2 x i32> zeroinitializer
+}
+
+; (x & c1) ^ (x & c1) => 0
+define i32 @xor_special4(i32 %x) {
+  %or = and i32 %x, 123
+  %or1 = and i32 123, %x
+  %xor = xor i32 %or, %or1
+  ret i32 %xor
+;CHECK-LABEL: @xor_special4(
+;CHECK: ret i32 0
+}
+
+; (x & c1) ^ (x & c1) => 0
+define <2 x i32> @xor_special4_vec(<2 x i32> %x) {
+  %or = and <2 x i32> %x, <i32 123, i32 123>
+  %or1 = and <2 x i32> <i32 123, i32 123>, %x
+  %xor = xor <2 x i32> %or, %or1
+  ret <2 x i32> %xor
+;CHECK-LABEL: @xor_special4_vec(
+;CHECK: ret <2 x i32> zeroinitializer
+}
+
+; ==========================================================================
+;
+;  Xor reassociation curtail code size
+;  
+; ==========================================================================
+
+; (x | c1) ^ (x | c2) => (x & c3) ^ c3
+; is enabled if one of the operands has multiple uses
+;   
+define i32 @xor_ra_size1(i32 %x) {
+  %or = or i32 %x, 123
+  %or1 = or i32 %x, 456
+  %xor = xor i32 %or, %or1
+
+  %add = add i32 %xor, %or
+  ret i32 %add
+;CHECK-LABEL: @xor_ra_size1(
+;CHECK: %xor = xor i32 %and.ra, 435
+}
+
+; (x | c1) ^ (x | c2) => (x & c3) ^ c3
+; is disabled if both operands have multiple uses.
+;   
+define i32 @xor_ra_size2(i32 %x) {
+  %or = or i32 %x, 123
+  %or1 = or i32 %x, 456
+  %xor = xor i32 %or, %or1
+
+  %add = add i32 %xor, %or
+  %add2 = add i32 %add, %or1
+  ret i32 %add2
+
+;CHECK-LABEL: @xor_ra_size2(
+;CHECK: %or1 = or i32 %x, 456
+;CHECK: %xor = xor i32 %or, %or1
+}
+
+
+; ==========================================================================
+;
+;  Xor reassociation bugs
+;  
+; ==========================================================================
+
+ at xor_bug1_data = external global <{}>, align 4
+define void @xor_bug1() {
+  %1 = ptrtoint i32* undef to i64
+  %2 = xor i64 %1, ptrtoint (<{}>* @xor_bug1_data to i64)
+  %3 = and i64 undef, %2
+  ret void
+}
+
+; The bug was that when the compiler optimized "(x | c1)" ^ "(x & c2)", it may
+; swap the two xor-subexpressions if they are not in canonical order; however,
+; when the optimizer swapped the two sub-expressions, it forgot to swap the
+; cached values of c1 and c2 accordingly, hence causing the problem.
+;
+define i32 @xor_bug2(i32, i32, i32, i32) {
+  %5 = mul i32 %0, 123
+  %6 = add i32 %2, 24
+  %7 = add i32 %1, 8
+  %8 = and i32 %1, 3456789
+  %9 = or i32 %8,  4567890
+  %10 = and i32 %1, 543210987
+  %11 = or i32 %1, 891034567
+  %12 = and i32 %2, 255
+  %13 = xor i32 %9, %10
+  %14 = xor i32 %11, %13
+  %15 = xor i32 %5, %14
+  %16 = and i32 %3, 255
+  %17 = xor i32 %16, 42
+  %18 = add i32 %6, %7
+  %19 = add i32 %18, %12
+  %20 = add i32 %19, %15
+  ret i32 %20
+;CHECK-LABEL: @xor_bug2(
+;CHECK: xor i32 %5, 891034567
+}

Added: llvm/trunk/test/Transforms/Reg2Mem/crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Reg2Mem/crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Reg2Mem/crash.ll (added)
+++ llvm/trunk/test/Transforms/Reg2Mem/crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,88 @@
+; RUN: opt -reg2mem -disable-output < %s
+; PR14782
+
+declare void @f1()
+
+declare i32 @__gxx_personality_sj0(...)
+
+declare void @f2()
+
+declare void @f3()
+
+declare void @f4_()
+
+declare void @_Z12xxxdtsP10xxxpq()
+
+define hidden void @_ZN12xxxyzIi9xxxwLi29ELi0EE4f3NewES0_i() ssp align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) {
+bb:
+  invoke void @f4_()
+          to label %bb1 unwind label %.thread
+
+.thread:                                          ; preds = %bb
+  %tmp = landingpad { i8*, i32 }
+          cleanup
+  br label %bb13
+
+bb1:                                              ; preds = %bb
+  invoke void @f1()
+          to label %.noexc unwind label %bb10
+
+.noexc:                                           ; preds = %bb1
+  invoke void @f4_()
+          to label %bb6 unwind label %bb2
+
+bb2:                                              ; preds = %.noexc
+  %tmp3 = landingpad { i8*, i32 }
+          cleanup
+  invoke void @f3()
+          to label %.body unwind label %bb4
+
+bb4:                                              ; preds = %bb2
+  %tmp5 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+bb6:                                              ; preds = %.noexc
+  invoke void @_Z12xxxdtsP10xxxpq()
+          to label %_ZN6xxxdIN12xxxyzIi9xxxwLi29ELi0EE4fr1jS3_.exit unwind label %bb10
+
+_ZN6xxxdIN12xxxyzIi9xxxwLi29ELi0EE4fr1jS3_.exit:  ; preds = %bb6
+  invoke void @f2()
+          to label %bb7 unwind label %bb8
+
+bb7:                                              ; preds = %_ZN6xxxdIN12xxxyzIi9xxxwLi29ELi0EE4fr1jS3_.exit
+  ret void
+
+bb8:                                              ; preds = %_ZN6xxxdIN12xxxyzIi9xxxwLi29ELi0EE4fr1jS3_.exit
+  %tmp9 = landingpad { i8*, i32 }
+          cleanup
+  br label %_ZN10xxxpqdlev.exit
+
+bb10:                                             ; preds = %bb6, %bb1
+  %.1 = phi i1 [ true, %bb1 ], [ false, %bb6 ]
+  %tmp11 = landingpad { i8*, i32 }
+          cleanup
+  br label %.body
+
+.body:                                            ; preds = %bb10, %bb2
+  %.1.lpad-body = phi i1 [ %.1, %bb10 ], [ true, %bb2 ]
+  invoke void @f2()
+          to label %bb12 unwind label %bb14
+
+bb12:                                             ; preds = %.body
+  br i1 %.1.lpad-body, label %bb13, label %_ZN10xxxpqdlev.exit
+
+bb13:                                             ; preds = %bb12, %.thread
+  invoke void @xxx_MemFree()
+          to label %_ZN10xxxpqdlev.exit unwind label %bb14
+
+_ZN10xxxpqdlev.exit:                              ; preds = %bb13, %bb12, %bb8
+  resume { i8*, i32 } undef
+
+bb14:                                             ; preds = %bb13, %.body
+  %tmp15 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+}
+
+declare void @xxx_MemFree()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-1.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %merged_value base %merged_value.base
+
+declare void @site_for_call_safpeoint()
+
+define i64 addrspace(1)* @test(i64 addrspace(1)* %base_obj_x, i64 addrspace(1)* %base_obj_y, i1 %runtime_condition) gc "statepoint-example" {
+entry:
+  br i1 %runtime_condition, label %here, label %there
+
+here:                                             ; preds = %entry
+  %x = getelementptr i64, i64 addrspace(1)* %base_obj_x, i32 1
+  br label %merge
+
+there:                                            ; preds = %entry
+  %y = getelementptr i64, i64 addrspace(1)* %base_obj_y, i32 1
+  br label %merge
+
+merge:                                            ; preds = %there, %here
+; CHECK-LABEL: merge:
+; CHECK:   %merged_value.base = phi i64 addrspace(1)* [ %base_obj_x, %here ], [ %base_obj_y, %there ]
+  %merged_value = phi i64 addrspace(1)* [ %x, %here ], [ %y, %there ]
+  call void @site_for_call_safpeoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %merged_value
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-10.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-10.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-10.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-10.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+
+declare i1 @runtime_value() "gc-leaf-function"
+
+declare void @do_safepoint()
+
+define void @select_of_phi(i64 addrspace(1)* %base_obj_x, i64 addrspace(1)* %base_obj_y) gc "statepoint-example" {
+entry:
+  br label %loop
+
+loop:                                             ; preds = %merge, %entry
+  %current_x = phi i64 addrspace(1)* [ %base_obj_x, %entry ], [ %next_x, %merge ]
+  %current_y = phi i64 addrspace(1)* [ %base_obj_y, %entry ], [ %next_y, %merge ]
+  %current = phi i64 addrspace(1)* [ null, %entry ], [ %next, %merge ]
+  %condition = call i1 @runtime_value()
+  %next_x = getelementptr i64, i64 addrspace(1)* %current_x, i32 1
+  %next_y = getelementptr i64, i64 addrspace(1)* %current_y, i32 1
+  br i1 %condition, label %true, label %false
+
+true:                                             ; preds = %loop
+  br label %merge
+
+false:                                            ; preds = %loop
+  br label %merge
+
+merge:                                            ; preds = %false, %true
+  %next = phi i64 addrspace(1)* [ %next_x, %true ], [ %next_y, %false ]
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %loop
+}
+; CHECK: Base Pairs (w/o Relocation):
+; CHECK-DAG: derived %next base %next.base
+; CHECK-DAG: derived %next_x base %base_obj_x
+; CHECK-DAG: derived %next_y base %base_obj_y

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-11.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-11.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-11.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-11.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers  -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers  -S 2>&1 | FileCheck %s
+
+; CHECK: derived %next base %base_obj
+
+declare void @do_safepoint()
+
+define void @test(i64 addrspace(1)* %base_obj) gc "statepoint-example" {
+entry:
+  %obj = getelementptr i64, i64 addrspace(1)* %base_obj, i32 1
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+; CHECK-LABEL: loop:
+; CHECK: phi i64 addrspace(1)*
+; CHECK-DAG:  [ %base_obj.relocated.casted, %loop ] 
+; CHECK-DAG:  [ %base_obj, %entry ]
+; CHECK:  %current = phi i64 addrspace(1)* 
+; CHECK-DAG:  [ %obj, %entry ]
+; CHECK-DAG:  [ %next.relocated.casted, %loop ]
+  %current = phi i64 addrspace(1)* [ %obj, %entry ], [ %next, %loop ]
+  %next = getelementptr i64, i64 addrspace(1)* %current, i32 1
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %loop
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-12.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-12.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-12.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-12.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %select base null
+
+ at global = external addrspace(1) global i8
+
+define i8 @test(i1 %cond) gc "statepoint-example" {
+  %derived1 = getelementptr i8, i8 addrspace(1)* @global, i64 1
+  %derived2 = getelementptr i8, i8 addrspace(1)* @global, i64 2
+  %select = select i1 %cond, i8 addrspace(1)* %derived1, i8 addrspace(1)* %derived2
+  call void @extern()
+; CHECK-NOT: relocate
+; CHECK: %load = load i8, i8 addrspace(1)* %select
+  %load = load i8, i8 addrspace(1)* %select
+  ret i8 %load
+}
+
+declare void @extern() gc "statepoint-example"
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-13.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-13.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-13.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-13.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %derived base null
+
+ at global = external addrspace(1) global i8
+
+define i8 @test(i64 %offset) gc "statepoint-example" {
+  %derived = getelementptr i8, i8 addrspace(1)* @global, i64 %offset
+  call void @extern()
+; CHECK-NOT: relocate
+; CHECK-NOT: remat
+; CHECK: %load = load i8, i8 addrspace(1)* %derived
+  %load = load i8, i8 addrspace(1)* %derived
+  ret i8 %load
+}
+
+declare void @extern() gc "statepoint-example"
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-2.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %merged_value base %base_obj
+
+define i64 addrspace(1)* @test(i64 addrspace(1)* %base_obj, i1 %runtime_condition) gc "statepoint-example" {
+entry:
+  br i1 %runtime_condition, label %merge, label %there
+
+there:                                            ; preds = %entry
+  %derived_obj = getelementptr i64, i64 addrspace(1)* %base_obj, i32 1
+  br label %merge
+
+merge:                                            ; preds = %there, %entry
+  %merged_value = phi i64 addrspace(1)* [ %base_obj, %entry ], [ %derived_obj, %there ]
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %merged_value
+}
+
+declare void @foo()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-3.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-3.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-3.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %next.i64 base %base_obj
+
+define void @test(i64 addrspace(1)* %base_obj) gc "statepoint-example" {
+entry:
+  %obj = getelementptr i64, i64 addrspace(1)* %base_obj, i32 1
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  %current = phi i64 addrspace(1)* [ %obj, %entry ], [ %next.i64, %loop ]
+  %current.i32 = bitcast i64 addrspace(1)* %current to i32 addrspace(1)*
+  %next.i32 = getelementptr i32, i32 addrspace(1)* %current.i32, i32 1
+  %next.i64 = bitcast i32 addrspace(1)* %next.i32 to i64 addrspace(1)*
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %loop
+}
+
+declare void @do_safepoint()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-4.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %obj_to_consume base %obj_to_consume.base
+
+declare void @foo()
+
+declare i64 addrspace(1)* @generate_obj()
+
+declare void @consume_obj(i64 addrspace(1)*)
+
+define void @test(i32 %condition) gc "statepoint-example" {
+entry:
+  br label %loop
+
+loop:                                             ; preds = %merge.split, %entry
+; CHECK: loop:
+; CHECK:  [[TOKEN_0:%[^ ]+]] = call token (i64, i32, i64 addrspace(1)* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* ()* @generate_obj, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i3
+; CHECK-NEXT:  [[RESULT_0:%[^ ]+]] = call i64 addrspace(1)* @llvm.experimental.gc.result
+  %0 = call i64 addrspace(1)* @generate_obj() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  switch i32 %condition, label %dest_a [
+    i32 0, label %dest_b
+    i32 1, label %dest_c
+  ]
+
+dest_a:                                           ; preds = %loop
+  br label %merge
+
+dest_b:                                           ; preds = %loop
+  br label %merge
+
+dest_c:                                           ; preds = %loop
+  br label %merge
+
+merge:                                            ; preds = %dest_c, %dest_b, %dest_a
+; CHECK: merge:
+; CHECK:  %obj_to_consume = phi i64 addrspace(1)* [ [[RESULT_0]], %dest_a ], [ null, %dest_b ], [ null, %dest_c ]
+  %obj_to_consume = phi i64 addrspace(1)* [ %0, %dest_a ], [ null, %dest_b ], [ null, %dest_c ]
+  call void @consume_obj(i64 addrspace(1)* %obj_to_consume) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %merge.split
+
+merge.split:                                      ; preds = %merge
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %loop
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-5.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-5.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-5.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-5.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %merged_value base %merged_value.base
+
+declare void @foo()
+
+define i64 addrspace(1)* @test(i64 addrspace(1)* %base_obj_x, i64 addrspace(1)* %base_obj_y, i1 %runtime_condition) gc "statepoint-example" {
+entry:
+  br i1 %runtime_condition, label %here, label %there
+
+here:                                             ; preds = %entry
+  br label %bump
+
+bump:                                             ; preds = %here
+  br label %merge
+
+there:                                            ; preds = %entry
+  %y = getelementptr i64, i64 addrspace(1)* %base_obj_y, i32 1
+  br label %merge
+
+merge:                                            ; preds = %there, %bump
+; CHECK: merge:
+; CHECK:  %merged_value.base = phi i64 addrspace(1)* [ %base_obj_x, %bump ], [ %base_obj_y, %there ]
+; CHECK-NEXT:  %merged_value = phi i64 addrspace(1)* [ %base_obj_x, %bump ], [ %y, %there ]  
+  %merged_value = phi i64 addrspace(1)* [ %base_obj_x, %bump ], [ %y, %there ]
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %merged_value
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-6.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-6.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-6.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-6.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %merged_value base %merged_value.base
+
+declare void @site_for_call_safpeoint()
+
+define i64 addrspace(1)* @test(i64 addrspace(1)* %base_obj_x, i64 addrspace(1)* %base_obj_y, i1 %runtime_condition_x, i1 %runtime_condition_y) gc "statepoint-example" {
+entry:
+  br i1 %runtime_condition_x, label %here, label %there
+
+here:                                             ; preds = %entry
+  br i1 %runtime_condition_y, label %bump_here_a, label %bump_here_b
+
+bump_here_a:                                      ; preds = %here
+  %x_a = getelementptr i64, i64 addrspace(1)* %base_obj_x, i32 1
+  br label %merge_here
+
+bump_here_b:                                      ; preds = %here
+  %x_b = getelementptr i64, i64 addrspace(1)* %base_obj_x, i32 2
+  br label %merge_here
+
+merge_here:                                       ; preds = %bump_here_b, %bump_here_a
+  %x = phi i64 addrspace(1)* [ %x_a, %bump_here_a ], [ %x_b, %bump_here_b ]
+  br label %merge
+
+there:                                            ; preds = %entry
+  %y = getelementptr i64, i64 addrspace(1)* %base_obj_y, i32 1
+  br label %merge
+
+merge:                                            ; preds = %there, %merge_here
+; CHECK: merge:
+; CHECK:  %merged_value.base = phi i64 addrspace(1)* [ %base_obj_x, %merge_here ], [ %base_obj_y, %there ]
+; CHECK-NEXT:  %merged_value = phi i64 addrspace(1)* [ %x, %merge_here ], [ %y, %there ]  
+  %merged_value = phi i64 addrspace(1)* [ %x, %merge_here ], [ %y, %there ]
+  call void @site_for_call_safpeoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %merged_value
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-7.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-7.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-7.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-7.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %merged_value base %merged_value.base
+
+declare void @site_for_call_safpeoint()
+
+define i64 addrspace(1)* @test(i64 addrspace(1)* %base_obj_x, i64 addrspace(1)* %base_obj_y, i1 %runtime_condition_x, i1 %runtime_condition_y) gc "statepoint-example" {
+entry:
+  br i1 %runtime_condition_x, label %here, label %there
+
+here:                                             ; preds = %entry
+  br i1 %runtime_condition_y, label %bump_here_a, label %bump_here_b
+
+bump_here_a:                                      ; preds = %here
+  %x_a = getelementptr i64, i64 addrspace(1)* %base_obj_x, i32 1
+  br label %merge_here
+
+bump_here_b:                                      ; preds = %here
+  %x_b = getelementptr i64, i64 addrspace(1)* %base_obj_y, i32 2
+  br label %merge_here
+
+merge_here:                                       ; preds = %bump_here_b, %bump_here_a
+; CHECK: merge_here:
+; CHECK-DAG: %x.base
+; CHECK-DAG: phi i64 addrspace(1)*
+; CHECK-DAG: [ %base_obj_x, %bump_here_a ]
+; CHECK-DAG: [ %base_obj_y, %bump_here_b ]
+  %x = phi i64 addrspace(1)* [ %x_a, %bump_here_a ], [ %x_b, %bump_here_b ]
+  br label %merge
+
+there:                                            ; preds = %entry
+  %y = getelementptr i64, i64 addrspace(1)* %base_obj_y, i32 1
+  br label %merge
+
+merge:                                            ; preds = %there, %merge_here
+; CHECK: merge:
+; CHECK-DAG:  %merged_value.base
+; CHECK-DAG: phi i64 addrspace(1)*
+; CHECK-DAG: %merge_here
+; CHECK-DAG: [ %base_obj_y, %there ]
+; CHECK:  %merged_value = phi i64 addrspace(1)* [ %x, %merge_here ], [ %y, %there ]  
+  %merged_value = phi i64 addrspace(1)* [ %x, %merge_here ], [ %y, %there ]
+  call void @site_for_call_safpeoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %merged_value
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-8.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-8.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-8.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S 2>&1 | FileCheck %s
+
+; CHECK: derived %next_element_ptr base %array_obj
+
+define i32 @null_in_array(i64 addrspace(1)* %array_obj) gc "statepoint-example" {
+entry:
+  %array_len_pointer.i64 = getelementptr i64, i64 addrspace(1)* %array_obj, i32 1
+  %array_len_pointer.i32 = bitcast i64 addrspace(1)* %array_len_pointer.i64 to i32 addrspace(1)*
+  %array_len = load i32, i32 addrspace(1)* %array_len_pointer.i32
+  %array_elems = bitcast i32 addrspace(1)* %array_len_pointer.i32 to i64 addrspace(1)* addrspace(1)*
+  br label %loop_check
+
+loop_check:                                       ; preds = %loop_back, %entry
+  %index = phi i32 [ 0, %entry ], [ %next_index, %loop_back ]
+  %current_element_ptr = phi i64 addrspace(1)* addrspace(1)* [ %array_elems, %entry ], [ %next_element_ptr, %loop_back ]
+  %index_lt = icmp ult i32 %index, %array_len
+  br i1 %index_lt, label %check_for_null, label %not_found
+
+check_for_null:                                   ; preds = %loop_check
+  %current_element = load i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %current_element_ptr
+  %is_null = icmp eq i64 addrspace(1)* %current_element, null
+  br i1 %is_null, label %found, label %loop_back
+
+loop_back:                                        ; preds = %check_for_null
+  %next_element_ptr = getelementptr i64 addrspace(1)*, i64 addrspace(1)* addrspace(1)* %current_element_ptr, i32 1
+  %next_index = add i32 %index, 1
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %loop_check
+
+not_found:                                        ; preds = %loop_check
+  ret i32 -1
+
+found:                                            ; preds = %check_for_null
+  ret i32 %index
+}
+
+declare void @do_safepoint()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-9.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-9.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-9.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers-9.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-print-base-pointers -S  2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-print-base-pointers -S  2>&1 | FileCheck %s
+
+; CHECK: derived %next base %base_obj
+
+declare i1 @runtime_value() "gc-leaf-function"
+
+define void @maybe_GEP(i64 addrspace(1)* %base_obj) gc "statepoint-example" {
+entry:
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  %current = phi i64 addrspace(1)* [ %base_obj, %entry ], [ %next, %loop ]
+  %condition = call i1 @runtime_value()
+  %maybe_next = getelementptr i64, i64 addrspace(1)* %current, i32 1
+  %next = select i1 %condition, i64 addrspace(1)* %maybe_next, i64 addrspace(1)* %current
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %loop
+}
+
+declare void @do_safepoint()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-pointers.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,155 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S 2>&1 | FileCheck %s
+
+; The rewriting needs to make %obj loop variant by inserting a phi 
+; of the original value and its relocation.
+
+declare i64 addrspace(1)* @generate_obj() "gc-leaf-function"
+
+declare void @use_obj(i64 addrspace(1)*) "gc-leaf-function"
+
+define void @def_use_safepoint() gc "statepoint-example" {
+; CHECK-LABEL: def_use_safepoint
+; CHECK: phi i64 addrspace(1)* 
+; CHECK-DAG: [ %obj.relocated.casted, %loop ]
+; CHECK-DAG: [ %obj, %entry ]
+entry:
+  %obj = call i64 addrspace(1)* @generate_obj()
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  call void @use_obj(i64 addrspace(1)* %obj)
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %loop
+}
+
+declare void @do_safepoint()
+
+declare void @parse_point(i64 addrspace(1)*)
+
+define i64 addrspace(1)* @test1(i32 %caller, i8 addrspace(1)* %a, i8 addrspace(1)* %b, i32 %unknown) gc "statepoint-example" {
+; CHECK-LABEL: test1
+entry:
+  br i1 undef, label %left, label %right
+
+left:                                             ; preds = %entry
+; CHECK: left:
+; CHECK-NEXT: %a.cast = bitcast i8 addrspace(1)* %a to i64 addrspace(1)*
+; CHECK-NEXT: [[CAST_L:%.*]] = bitcast i8 addrspace(1)* %a to i64 addrspace(1)*
+; Our safepoint placement pass calls removeUnreachableBlocks, which does a bunch
+; of simplifications to branch instructions.  This bug is visible only when
+; there are multiple branches into the same block from the same predecessor, and
+; the following ceremony is to make that artefact survive a call to 
+; removeUnreachableBlocks.  As an example, "br i1 undef, label %merge, label %merge"
+; will get simplified to "br label %merge" by removeUnreachableBlocks.
+  %a.cast = bitcast i8 addrspace(1)* %a to i64 addrspace(1)*
+  switch i32 %unknown, label %right [
+    i32 0, label %merge
+    i32 1, label %merge
+    i32 5, label %merge
+    i32 3, label %right
+  ]
+
+right:                                            ; preds = %left, %left, %entry
+; CHECK: right:
+; CHECK-NEXT: %b.cast = bitcast i8 addrspace(1)* %b to i64 addrspace(1)*
+; CHECK-NEXT: [[CAST_R:%.*]] = bitcast i8 addrspace(1)* %b to i64 addrspace(1)*
+  %b.cast = bitcast i8 addrspace(1)* %b to i64 addrspace(1)*
+  br label %merge
+
+merge:                                            ; preds = %right, %left, %left, %left
+; CHECK: merge:
+; CHECK-NEXT: %value.base = phi i64 addrspace(1)* [ [[CAST_L]], %left ], [ [[CAST_L]], %left ], [ [[CAST_L]], %left ], [ [[CAST_R]], %right ], !is_base_value !0
+  %value = phi i64 addrspace(1)* [ %a.cast, %left ], [ %a.cast, %left ], [ %a.cast, %left ], [ %b.cast, %right ]
+  call void @parse_point(i64 addrspace(1)* %value) [ "deopt"(i32 0, i32 0, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %value
+}
+
+;; The purpose of this test is to ensure that when two live values share a
+;;  base defining value with inherent conflicts, we end up with a *single*
+;;  base phi/select per such node.  This is testing an optimization, not a
+;;  fundamental correctness criteria
+define void @test2(i1 %cnd, i64 addrspace(1)* %base_obj, i64 addrspace(1)* %base_arg2) gc "statepoint-example" {
+; CHECK-LABEL: @test2
+entry:
+  %obj = getelementptr i64, i64 addrspace(1)* %base_obj, i32 1
+  br label %loop
+; CHECK-LABEL: loop
+; CHECK:   %current.base = phi i64 addrspace(1)*
+; CHECK-DAG: [ %base_obj, %entry ]
+
+; Given the two selects are equivalent, so are their base phis - ideally,
+; we'd have commoned these, but that's a missed optimization, not correctness.
+; CHECK-DAG: [ [[DISCARD:%.*.base.relocated.casted]], %loop ]
+; CHECK-NOT: extra.base
+; CHECK: next.base = select
+; CHECK: next = select
+; CHECK: extra2.base = select
+; CHECK: extra2 = select
+; CHECK: statepoint
+;; Both 'next' and 'extra2' are live across the backedge safepoint...
+
+loop:                                             ; preds = %loop, %entry
+  %current = phi i64 addrspace(1)* [ %obj, %entry ], [ %next, %loop ]
+  %extra = phi i64 addrspace(1)* [ %obj, %entry ], [ %extra2, %loop ]
+  %nexta = getelementptr i64, i64 addrspace(1)* %current, i32 1
+  %next = select i1 %cnd, i64 addrspace(1)* %nexta, i64 addrspace(1)* %base_arg2
+  %extra2 = select i1 %cnd, i64 addrspace(1)* %nexta, i64 addrspace(1)* %base_arg2
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %loop
+}
+
+define i64 addrspace(1)* @test3(i1 %cnd, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2) gc "statepoint-example" {
+; CHECK-LABEL: @test3
+entry:
+  br i1 %cnd, label %merge, label %taken
+
+taken:                                            ; preds = %entry
+  br label %merge
+
+merge:                                            ; preds = %taken, %entry
+; CHECK-LABEL: merge:
+; CHECK-NEXT: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: gc.statepoint
+  %bdv = phi i64 addrspace(1)* [ %obj, %entry ], [ %obj2, %taken ]
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %bdv
+}
+
+define i64 addrspace(1)* @test4(i1 %cnd, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2) gc "statepoint-example" {
+; CHECK-LABEL: @test4
+entry:
+  br i1 %cnd, label %merge, label %taken
+
+taken:                                            ; preds = %entry
+  br label %merge
+
+merge:                                            ; preds = %taken, %entry
+; CHECK-LABEL: merge:
+; CHECK-NEXT: phi
+; CHECK-NEXT: gc.statepoint
+  %bdv = phi i64 addrspace(1)* [ %obj, %entry ], [ %obj, %taken ]
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %bdv
+}
+
+define i64 addrspace(1)* @test5(i1 %cnd, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2) gc "statepoint-example" {
+; CHECK-LABEL: @test5
+entry:
+  br label %merge
+
+merge:                                            ; preds = %merge, %entry
+; CHECK-LABEL: merge:
+; CHECK-NEXT: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: br i1
+  %bdv = phi i64 addrspace(1)* [ %obj, %entry ], [ %obj2, %merge ]
+  br i1 %cnd, label %merge, label %next
+
+next:                                             ; preds = %merge
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %bdv
+}
+
+declare void @foo()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-vector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-vector.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/base-vector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,279 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck  %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck  %s
+
+
+define i64 addrspace(1)* @test(<2 x i64 addrspace(1)*> %vec, i32 %idx) gc "statepoint-example" {
+; CHECK-LABEL: @test
+; CHECK: extractelement
+; CHECK: extractelement
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%base_ee, %obj)
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%base_ee, %base_ee)
+; Note that the second extractelement is actually redundant here.  A correct output would
+; be to reuse the existing obj as a base since it is actually a base pointer.
+entry:
+  %obj = extractelement <2 x i64 addrspace(1)*> %vec, i32 %idx
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+define i64 addrspace(1)* @test2(<2 x i64 addrspace(1)*>* %ptr, i1 %cnd, i32 %idx1, i32 %idx2) gc "statepoint-example" {
+; CHECK-LABEL: test2
+entry:
+  br i1 %cnd, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+  %obja = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  br label %merge
+
+untaken:                                          ; preds = %entry
+  %objb = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  br label %merge
+
+merge:                                            ; preds = %untaken, %taken
+  %vec = phi <2 x i64 addrspace(1)*> [ %obja, %taken ], [ %objb, %untaken ]
+  br i1 %cnd, label %taken2, label %untaken2
+
+taken2:                                           ; preds = %merge
+  %obj0 = extractelement <2 x i64 addrspace(1)*> %vec, i32 %idx1
+  br label %merge2
+
+untaken2:                                         ; preds = %merge
+  %obj1 = extractelement <2 x i64 addrspace(1)*> %vec, i32 %idx2
+  br label %merge2
+
+merge2:                                           ; preds = %untaken2, %taken2
+; CHECK-LABEL: merge2:
+; CHECK: %obj.base = phi i64 addrspace(1)*
+; CHECK: %obj = phi i64 addrspace(1)*
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%obj.base, %obj)
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%obj.base, %obj.base)
+  %obj = phi i64 addrspace(1)* [ %obj0, %taken2 ], [ %obj1, %untaken2 ]
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+define i64 addrspace(1)* @test3(i64 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test3
+; CHECK: insertelement
+; CHECK: extractelement
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%obj.base, %obj)
+entry:
+  %vec = insertelement <2 x i64 addrspace(1)*> undef, i64 addrspace(1)* %ptr, i32 0
+  %obj = extractelement <2 x i64 addrspace(1)*> %vec, i32 0
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+define i64 addrspace(1)* @test4(i64 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test4
+; CHECK: statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: ; (%obj.base, %obj)
+; When we can optimize an extractelement from a known
+; index and avoid introducing new base pointer instructions
+entry:
+  %derived = getelementptr i64, i64 addrspace(1)* %ptr, i64 16
+  %veca = insertelement <2 x i64 addrspace(1)*> undef, i64 addrspace(1)* %derived, i32 0
+  %vec = insertelement <2 x i64 addrspace(1)*> %veca, i64 addrspace(1)* %ptr, i32 1
+  %obj = extractelement <2 x i64 addrspace(1)*> %vec, i32 0
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+declare void @use(i64 addrspace(1)*) "gc-leaf-function"
+declare void @use_vec(<4 x i64 addrspace(1)*>) "gc-leaf-function"
+
+define void @test5(i1 %cnd, i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test5
+; CHECK: gc.relocate
+; CHECK-DAG: (%bdv.base, %bdv)
+; When we fundamentally have to duplicate
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+  %vec = insertelement <2 x i64 addrspace(1)*> undef, i64 addrspace(1)* %gep, i32 0
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i32 0
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use(i64 addrspace(1)* %bdv)
+  ret void
+}
+
+define void @test6(i1 %cnd, i64 addrspace(1)* %obj, i64 %idx) gc "statepoint-example" {
+; CHECK-LABEL: @test6
+; CHECK: %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+; CHECK: %vec.base = insertelement <2 x i64 addrspace(1)*> zeroinitializer, i64 addrspace(1)* %obj, i32 0, !is_base_value !0
+; CHECK: %vec = insertelement <2 x i64 addrspace(1)*> undef, i64 addrspace(1)* %gep, i32 0
+; CHECK: %bdv.base = extractelement <2 x i64 addrspace(1)*> %vec.base, i64 %idx, !is_base_value !0
+; CHECK:  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%bdv.base, %bdv)
+; A more complicated example involving vector and scalar bases.
+; This is derived from a failing test case when we didn't have correct
+; insertelement handling.
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+  %vec = insertelement <2 x i64 addrspace(1)*> undef, i64 addrspace(1)* %gep, i32 0
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use(i64 addrspace(1)* %bdv)
+  ret void
+}
+
+define i64 addrspace(1)* @test7(i1 %cnd, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2) gc "statepoint-example" {
+; CHECK-LABEL: @test7
+entry:
+  %vec = insertelement <2 x i64 addrspace(1)*> undef, i64 addrspace(1)* %obj2, i32 0
+  br label %merge1
+
+merge1:                                           ; preds = %merge1, %entry
+; CHECK-LABEL: merge1:
+; CHECK: vec2.base
+; CHECK: vec2
+; CHECK: gep
+; CHECK: vec3.base
+; CHECK: vec3
+  %vec2 = phi <2 x i64 addrspace(1)*> [ %vec, %entry ], [ %vec3, %merge1 ]
+  %gep = getelementptr i64, i64 addrspace(1)* %obj2, i64 1
+  %vec3 = insertelement <2 x i64 addrspace(1)*> undef, i64 addrspace(1)* %gep, i32 0
+  br i1 %cnd, label %merge1, label %next1
+
+next1:                                            ; preds = %merge1
+; CHECK-LABEL: next1:
+; CHECK: bdv.base = 
+; CHECK: bdv = 
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec2, i32 0
+  br label %merge
+
+merge:                                            ; preds = %merge, %next1
+; CHECK-LABEL: merge:
+; CHECK: %objb.base
+; CHECK: %objb
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%objb.base, %objb)
+  %objb = phi i64 addrspace(1)* [ %obj, %next1 ], [ %bdv, %merge ]
+  br i1 %cnd, label %merge, label %next
+
+next:                                             ; preds = %merge
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %objb
+}
+
+; identify base for shufflevector
+define void @test8(i64 addrspace(1)* %obj, i64 %idx) gc "statepoint-example" {
+; CHECK-LABEL: @test8
+; CHECK: %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+; CHECK: %gep2 = getelementptr i64, i64 addrspace(1)* %obj, i64 2
+; CHECK: %vec1.base = insertelement <4 x i64 addrspace(1)*> zeroinitializer, i64 addrspace(1)* %obj, i32 0, !is_base_value !0
+; CHECK: %vec1 = insertelement <4 x i64 addrspace(1)*> undef, i64 addrspace(1)* %gep, i32 0
+; CHECK: %vec2.base = insertelement <4 x i64 addrspace(1)*> zeroinitializer, i64 addrspace(1)* %obj, i32 2, !is_base_value !0
+; CHECK: %vec2 = insertelement <4 x i64 addrspace(1)*> undef, i64 addrspace(1)* %gep2, i32 2
+; CHECK: %vec.base = shufflevector <4 x i64 addrspace(1)*> %vec1.base, <4 x i64 addrspace(1)*> %vec2.base, <2 x i32> <i32 0, i32 2>, !is_base_value !0
+; CHECK: %vec = shufflevector <4 x i64 addrspace(1)*> %vec1, <4 x i64 addrspace(1)*> %vec2, <2 x i32> <i32 0, i32 2>
+; CHECK: %bdv.base = extractelement <2 x i64 addrspace(1)*> %vec.base, i64 %idx, !is_base_value !0
+; CHECK: %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%bdv.base, %bdv)
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %obj, i64 1
+  %gep2 = getelementptr i64, i64 addrspace(1)* %obj, i64 2
+  %vec1 = insertelement <4 x i64 addrspace(1)*> undef, i64 addrspace(1)* %gep, i32 0
+  %vec2 = insertelement <4 x i64 addrspace(1)*> undef, i64 addrspace(1)* %gep2, i32 2
+  %vec = shufflevector <4 x i64 addrspace(1)*> %vec1, <4 x i64 addrspace(1)*> %vec2, <2 x i32> <i32 0, i32 2>
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use(i64 addrspace(1)* %bdv)
+  ret void
+}
+
+; Since the same 'base' vector is used in the shuffle operands, we do not need to
+; create a shufflevector base.
+define void @test9(<4 x i64 addrspace(1)*> %vec1, i64 %idx) gc "statepoint-example" {
+; CHECK-LABEL: @test9
+; CHECK: %vec = shufflevector <4 x i64 addrspace(1)*> %vec1, <4 x i64 addrspace(1)*> %vec1, <2 x i32> <i32 0, i32 2>
+; CHECK: %base_ee = extractelement <4 x i64 addrspace(1)*> %vec1, i64 %idx, !is_base_value !0
+; CHECK: %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%base_ee, %bdv)
+entry:
+ ; shrinking vec1 into vec
+  %vec = shufflevector <4 x i64 addrspace(1)*> %vec1, <4 x i64 addrspace(1)*> %vec1, <2 x i32> <i32 0, i32 2>
+  %bdv = extractelement <2 x i64 addrspace(1)*> %vec, i64 %idx
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use(i64 addrspace(1)* %bdv)
+  ret void
+}
+
+; vector operand of shufflevector is a phi
+define i64 addrspace(1)* @test10(i1 %cnd, i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2) gc "statepoint-example" {
+; CHECK-LABEL: @test10
+entry:
+  %vec1 = insertelement <4 x i64 addrspace(1)*> undef, i64 addrspace(1)* %obj, i32 0
+  br i1 %cnd, label %here, label %merge
+
+here:
+  %vec2 = insertelement <4 x i64 addrspace(1)*> undef, i64 addrspace(1)* %obj2, i32 2
+  br label %merge
+
+merge:                                           ; preds = %merge, %entry, %here
+; CHECK-LABEL: merge:
+; CHECK: %vec.base = phi <4 x i64 addrspace(1)*> [ %vec1.base, %entry ], [ %vec2.base, %here ], [ %vec3.base, %merge ], !is_base_value !0
+; CHECK: vec
+; CHECK: vec3.base = shufflevector <4 x i64 addrspace(1)*> %vec.base, <4 x i64 addrspace(1)*> %vec.base
+; CHECK: vec3
+; CHECK: bdv.base
+; CHECK: bdv
+  %vec = phi <4 x i64 addrspace(1)*> [ %vec1, %entry ], [ %vec2, %here], [ %vec3, %merge]
+  %vec3 = shufflevector <4 x i64 addrspace(1)*> %vec, <4 x i64 addrspace(1)*> %vec, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+  %bdv = extractelement <4 x i64 addrspace(1)*> %vec3, i32 0
+  br i1 %cnd, label %merge, label %next
+
+next:
+; CHECK-LABEL: next:
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; CHECK-DAG: (%bdv.base, %bdv)
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64 addrspace(1)* %bdv
+}
+declare void @do_safepoint()
+
+define void @test11(<4 x i64 addrspace(1)*> %vec1) gc "statepoint-example" {
+; CHECK-LABEL: @test11(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf{{.*}}<4 x i64 addrspace(1)*> %vec1)
+; CHECK: %vec1.relocated = call coldcc <4 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v4p1i8
+; CHECK: %vec1.relocated.casted = bitcast <4 x i8 addrspace(1)*> %vec1.relocated to <4 x i64 addrspace(1)*>
+; CHECK: %vec2.remat = getelementptr i64, <4 x i64 addrspace(1)*> %vec1.relocated.casted, i32 1024
+; CHECK: call void @use_vec(<4 x i64 addrspace(1)*> %vec2.remat)
+entry:
+  %vec2 = getelementptr i64, <4 x i64 addrspace(1)*> %vec1, i32 1024
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use_vec(<4 x i64 addrspace(1) *> %vec2)
+  ret void
+}
+
+declare <4 x i64 addrspace(1)*> @def_vec() "gc-leaf-function"
+
+define void @test12(<4 x i64 addrspace(1)*> %vec1) gc "statepoint-example" {
+; CHECK-LABEL: @test12(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf{{.*}}<4 x i64 addrspace(1)*> %vec)
+; CHECK-NEXT: %vec.relocated = call coldcc <4 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v4p1i8(
+; CHECK-NEXT: %vec.relocated.casted = bitcast <4 x i8 addrspace(1)*> %vec.relocated to <4 x i64 addrspace(1)*>
+; CHECK-NEXT: call void @use_vec(<4 x i64 addrspace(1)*> %vec.relocated.casted)
+; CHECK-NEXT: ret void
+entry:
+  %vec = call <4 x i64 addrspace(1)*> @def_vec()
+  call void @do_safepoint() [ "deopt"() ]
+  call void @use_vec(<4 x i64 addrspace(1)*> %vec)
+  ret void
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/basic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/basic.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/basic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,74 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+
+declare void @g()
+declare i32 @h()
+
+define i32 addrspace(1)* @f0(i32 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: @f0(
+ entry:
+; CHECK: [[TOKEN_0:%[^ ]+]] = call token {{[^@]*}} @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+  call void @g() [ "deopt"(i32 100) ]
+
+; CHECK: %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[TOKEN_0]], i32 8, i32 8)
+  ret i32 addrspace(1)* %arg
+}
+
+define i32 addrspace(1)* @f1(i32 addrspace(1)* %arg) gc "statepoint-example"  personality i32 8  {
+; CHECK-LABEL: @f1(
+ entry:
+; CHECK: [[TOKEN_1:%[^ ]+]] = invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+  invoke void @g() [ "deopt"(i32 100) ] to label %normal_dest unwind label %unwind_dest
+
+ normal_dest:
+; CHECK: %arg.relocated1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[TOKEN_1]], i32 8, i32 8)
+  ret i32 addrspace(1)* %arg
+
+ unwind_dest: 
+  %lpad = landingpad token cleanup
+  resume token undef
+}
+
+define i32 addrspace(1)* @f2(i32 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: @f2(
+ entry:
+; CHECK: [[TOKEN_2:%[^ ]+]] = call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+  %val = call i32 @h() [ "deopt"(i32 100) ]
+
+; CHECK: [[RESULT_F2:%[^ ]+]] = call i32 @llvm.experimental.gc.result.i32(token [[TOKEN_2]])
+; CHECK: %arg.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[TOKEN_2]], i32 8, i32 8)
+; CHECK: %arg.relocated.casted = bitcast i8 addrspace(1)* %arg.relocated to i32 addrspace(1)*
+
+  store i32 %val, i32 addrspace(1)* %arg
+; CHECK: store i32 [[RESULT_F2]], i32 addrspace(1)* %arg.relocated.casted
+  ret i32 addrspace(1)* %arg
+}
+
+define i32 addrspace(1)* @f3(i32 addrspace(1)* %arg) gc "statepoint-example"  personality i32 8  {
+; CHECK-LABEL: @f3(
+ entry:
+; CHECK: [[TOKEN_3:%[^ ]+]] = invoke token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @h, i32 0, i32 0, i32 0, i32 1, i32 100, i32 addrspace(1)* %arg)
+  %val = invoke i32 @h() [ "deopt"(i32 100) ] to label %normal_dest unwind label %unwind_dest
+
+ normal_dest:
+; CHECK: [[RESULT_F3:%[^ ]+]] = call i32 @llvm.experimental.gc.result.i32(token [[TOKEN_3]])
+; CHECK: [[ARG_RELOCATED:%[^ ]+]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[TOKEN_3]], i32 8, i32 8)
+; CHECK: [[ARG_RELOCATED_CASTED:%[^ ]+]] = bitcast i8 addrspace(1)* [[ARG_RELOCATED]] to i32 addrspace(1)*
+
+  store i32 %val, i32 addrspace(1)* %arg
+
+; CHECK: store i32 [[RESULT_F3]], i32 addrspace(1)* [[ARG_RELOCATED_CASTED]]
+  ret i32 addrspace(1)* %arg
+
+ unwind_dest: 
+  %lpad = landingpad token cleanup
+  resume token undef
+}
+
+define i32 addrspace(1)* @f4(i32 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: @f4(
+ entry:
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @g, i32 0, i32 1, i32 2, i32 400, i8 90,
+  call void @g() [ "gc-transition"(i32 400, i8 90) ]
+  ret i32 addrspace(1)* %arg
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/basics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/basics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/basics.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/basics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,89 @@
+; This is a collection of really basic tests for gc.statepoint rewriting.
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-rematerialization-threshold=0 -S | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-rematerialization-threshold=0 -S | FileCheck %s
+
+; Trivial relocation over a single call
+
+declare void @foo()
+
+define i8 addrspace(1)* @test1(i8 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test1
+entry:
+; CHECK-LABEL: entry:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; Two safepoints in a row (i.e. consistent liveness)
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %obj
+}
+
+define i8 addrspace(1)* @test2(i8 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test2
+entry:
+; CHECK-LABEL: entry:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated2 = call coldcc i8 addrspace(1)*
+; A simple derived pointer
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %obj
+}
+
+define i8 @test3(i8 addrspace(1)* %obj) gc "statepoint-example" {
+entry:
+; CHECK-LABEL: entry:
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: %derived.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: load i8, i8 addrspace(1)* %derived.relocated
+; CHECK-NEXT: load i8, i8 addrspace(1)* %obj.relocated
+; Tests to make sure we visit both the taken and untaken predecessor 
+; of merge.  This was a bug in the dataflow liveness at one point.
+  %derived = getelementptr i8, i8 addrspace(1)* %obj, i64 10
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  %a = load i8, i8 addrspace(1)* %derived
+  %b = load i8, i8 addrspace(1)* %obj
+  %c = sub i8 %a, %b
+  ret i8 %c
+}
+
+define i8 addrspace(1)* @test4(i1 %cmp, i8 addrspace(1)* %obj) gc "statepoint-example" {
+entry:
+  br i1 %cmp, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+; CHECK-LABEL: taken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %merge
+
+untaken:                                          ; preds = %entry
+; CHECK-LABEL: untaken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated2 = call coldcc i8 addrspace(1)*
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %merge
+
+merge:                                            ; preds = %untaken, %taken
+; CHECK-LABEL: merge:
+; CHECK-NEXT: %.0 = phi i8 addrspace(1)* [ %obj.relocated, %taken ], [ %obj.relocated2, %untaken ]
+; CHECK-NEXT: ret i8 addrspace(1)* %.0
+; When run over a function which doesn't opt in, should do nothing!
+  ret i8 addrspace(1)* %obj
+}
+
+define i8 addrspace(1)* @test5(i8 addrspace(1)* %obj) gc "ocaml" {
+; CHECK-LABEL: @test5
+entry:
+; CHECK-LABEL: entry:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NOT: %obj.relocated = call coldcc i8 addrspace(1)*
+  %0 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  ret i8 addrspace(1)* %obj
+}
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/call-gc-result.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/call-gc-result.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/call-gc-result.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/call-gc-result.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+;; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck %s
+;; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck %s
+
+;; This test is to verify that gc_result from a call statepoint
+;; can have preceding phis in its parent basic block. Unlike
+;; invoke statepoint, call statepoint does not terminate the
+;; block, and thus its gc_result is in the same block with the
+;; call statepoint.
+
+declare i32 @foo()
+
+define i32 @test1(i1 %cond, i32 %a) gc "statepoint-example" {
+entry:
+  br i1 %cond, label %branch1, label %branch2
+  
+branch1:
+  %b = add i32 %a, 1
+  br label %merge
+ 
+branch2:
+  br label %merge
+
+merge:
+;; CHECK: 		%phi = phi i32 [ %a, %branch2 ], [ %b, %branch1 ]
+;; CHECK-NEXT:  [[TOKEN:%[^ ]+]] = call token (i64, i32, i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i64 2882400000, i32 0, i32 ()* @foo, i32 0, i32 0, i32 0, i32 0
+;; CHECK-NEXT:  call i32 @llvm.experimental.gc.result.i32(token [[TOKEN]])
+  %phi = phi i32 [ %a, %branch2 ], [ %b, %branch1 ]
+  %ret = call i32 @foo()
+  ret i32 %ret
+}
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/check_traversal_order.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/check_traversal_order.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/check_traversal_order.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/check_traversal_order.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f()
+declare void @g(i8 addrspace(1)*, i8 addrspace(1)*)
+declare i32 @personality_function()
+
+; Make sure that we do not fail assertion because we process call of @g before
+; we process the call of @f.
+
+define void @test_01(i8 addrspace(1)* %p, i1 %cond) gc "statepoint-example" personality i32 ()* @personality_function {
+
+; CHECK-LABEL: @test_01(
+
+entry:
+  %tmp0 = insertelement <2 x i8 addrspace(1)*> undef, i8 addrspace(1)* %p, i32 0
+  %tmp1 = insertelement <2 x i8 addrspace(1)*> %tmp0, i8 addrspace(1)* %p, i32 1
+  %tmp2 = extractelement <2 x i8 addrspace(1)*> %tmp1, i32 1
+  %tmp3 = extractelement <2 x i8 addrspace(1)*> %tmp1, i32 0
+  br label %loop
+
+loop:
+  br i1 %cond, label %cond_block, label %exit
+
+cond_block:
+  br i1 %cond, label %backedge, label %exit
+
+exit:
+  %tmp4 = phi i8 addrspace(1)* [ %tmp2, %loop ], [ %tmp2, %cond_block ]
+  call void @g(i8 addrspace(1)* %tmp3, i8 addrspace(1)* %tmp4)
+  ret void
+
+backedge:
+  call void @f()
+  br label %loop
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/codegen-cond.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/codegen-cond.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/codegen-cond.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/codegen-cond.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,82 @@
+; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -S < %s | FileCheck %s
+
+; A null test of a single value
+
+define i1 @test(i8 addrspace(1)* %p, i1 %rare) gc "statepoint-example" {
+; CHECK-LABEL: @test
+entry:
+  %cond = icmp eq i8 addrspace(1)* %p, null
+  br i1 %rare, label %safepoint, label %continue, !prof !0
+
+safepoint:                                        ; preds = %entry
+  call void @safepoint() [ "deopt"() ]
+  br label %continue
+
+continue:                                         ; preds = %safepoint, %entry
+; CHECK-LABEL: continue:
+; CHECK: phi
+; CHECK-DAG: [ %p.relocated, %safepoint ]
+; CHECK-DAG: [ %p, %entry ]
+; CHECK: %cond = icmp
+; CHECK: br i1 %cond
+; Comparing two pointers
+  br i1 %cond, label %taken, label %untaken
+
+taken:                                            ; preds = %continue
+  ret i1 true
+
+untaken:                                          ; preds = %continue
+  ret i1 false
+}
+
+define i1 @test2(i8 addrspace(1)* %p, i8 addrspace(1)* %q, i1 %rare) gc "statepoint-example" {
+; CHECK-LABEL: @test2
+entry:
+  %cond = icmp eq i8 addrspace(1)* %p, %q
+  br i1 %rare, label %safepoint, label %continue, !prof !0
+
+safepoint:                                        ; preds = %entry
+  call void @safepoint() [ "deopt"() ]
+  br label %continue
+
+continue:                                         ; preds = %safepoint, %entry
+; CHECK-LABEL: continue:
+; CHECK: phi
+; CHECK-DAG: [ %q.relocated, %safepoint ]
+; CHECK-DAG: [ %q, %entry ]
+; CHECK: phi
+; CHECK-DAG: [ %p.relocated, %safepoint ]
+; CHECK-DAG: [ %p, %entry ]
+; CHECK: %cond = icmp
+; CHECK: br i1 %cond
+; Sanity check that nothing bad happens if already last instruction
+; before terminator
+  br i1 %cond, label %taken, label %untaken
+
+taken:                                            ; preds = %continue
+  ret i1 true
+
+untaken:                                          ; preds = %continue
+  ret i1 false
+}
+
+define i1 @test3(i8 addrspace(1)* %p, i8 addrspace(1)* %q, i1 %rare) gc "statepoint-example" {
+; CHECK-LABEL: @test3
+; CHECK: gc.statepoint
+; CHECK: %cond = icmp
+; CHECK: br i1 %cond
+entry:
+  call void @safepoint() [ "deopt"() ]
+  %cond = icmp eq i8 addrspace(1)* %p, %q
+  br i1 %cond, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+  ret i1 true
+
+untaken:                                          ; preds = %entry
+  ret i1 false
+}
+
+declare void @safepoint()
+!0 = !{!"branch_weights", i32 1, i32 10000}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/constants.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/constants.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/constants.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/constants.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,264 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+
+; constants don't get relocated.
+ at G = addrspace(1) global i8 5
+
+declare void @foo()
+
+define i8 @test() gc "statepoint-example" {
+; CHECK-LABEL: @test
+; CHECK: gc.statepoint
+; CHECK-NEXT: load i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*)
+; Mostly just here to show where a reasonable test case can come from.
+entry:
+  call void @foo() [ "deopt"() ]
+  %res = load i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*)
+  ret i8 %res
+}
+
+define i8 @test2(i8 addrspace(1)* %p) gc "statepoint-example" {
+; CHECK-LABEL: @test2
+; CHECK: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: icmp
+; Globals don't move and thus don't get relocated
+entry:
+  call void @foo() [ "deopt"() ]
+  %cmp = icmp eq i8 addrspace(1)* %p, null
+  br i1 %cmp, label %taken, label %not_taken
+
+taken:                                            ; preds = %not_taken, %entry
+  ret i8 0
+
+not_taken:                                        ; preds = %entry
+  %cmp2 = icmp ne i8 addrspace(1)* %p, null
+  br i1 %cmp2, label %taken, label %dead
+
+dead:                                             ; preds = %not_taken
+  %addr = getelementptr i8, i8 addrspace(1)* %p, i32 15
+  %res = load i8, i8 addrspace(1)* %addr
+  ret i8 %res
+}
+
+define i8 @test3(i1 %always_true) gc "statepoint-example" {
+; CHECK-LABEL: @test3
+; CHECK: gc.statepoint
+; CHECK-NEXT: load i8, i8 addrspace(1)* @G
+entry:
+  call void @foo() [ "deopt"() ]
+  %res = load i8, i8 addrspace(1)* @G, align 1
+  ret i8 %res
+}
+
+; Even for source languages without constant references, we can
+; see constants can show up along paths where the value is dead.
+; This is particularly relevant when computing bases of PHIs.
+define i8 addrspace(1)* @test4(i8 addrspace(1)* %p) gc "statepoint-example" {
+; CHECK-LABEL: @test4
+entry:
+  %is_null = icmp eq i8 addrspace(1)* %p, null
+  br i1 %is_null, label %split, label %join
+
+split:
+  call void @foo()
+  %arg_value_addr.i = getelementptr inbounds i8, i8 addrspace(1)* %p, i64 8
+  %arg_value_addr_casted.i = bitcast i8 addrspace(1)* %arg_value_addr.i to i8 addrspace(1)* addrspace(1)*
+  br label %join
+
+join:
+; CHECK-LABEL: join
+; CHECK: %addr2.base =
+  %addr2 = phi i8 addrspace(1)* addrspace(1)* [ %arg_value_addr_casted.i, %split ], [ inttoptr (i64 8 to i8 addrspace(1)* addrspace(1)*), %entry ]
+  ;; NOTE: This particular example can be jump-threaded, but in general,
+  ;; we can't, and have to deal with the resulting IR.
+  br i1 %is_null, label %early-exit, label %use
+
+early-exit:
+  ret i8 addrspace(1)* null
+
+use:
+; CHECK-LABEL: use:
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+  call void @foo()
+  %res = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %addr2, align 1
+  ret i8 addrspace(1)* %res
+}
+
+; Globals don't move and thus don't get relocated
+define i8 addrspace(1)* @test5(i1 %always_true) gc "statepoint-example" {
+; CHECK-LABEL: @test5
+; CHECK: gc.statepoint
+; CHECK-NEXT: %res = extractelement <2 x i8 addrspace(1)*> <i8 addrspace(1)* @G, i8 addrspace(1)* @G>, i32 0
+entry:
+  call void @foo()
+  %res = extractelement <2 x i8 addrspace(1)*> <i8 addrspace(1)* @G, i8 addrspace(1)* @G>, i32 0
+  ret i8 addrspace(1)* %res
+}
+
+define i8 addrspace(1)* @test6(i64 %arg) gc "statepoint-example" {
+entry:
+  ; Don't fail any assertions and don't record null as a live value
+  ; CHECK-LABEL: test6
+  ; CHECK: gc.statepoint
+  ; CHECK-NOT: call {{.*}}gc.relocate
+  %load_addr = getelementptr i8, i8 addrspace(1)* null, i64 %arg
+  call void @foo() [ "deopt"() ]
+  ret i8 addrspace(1)* %load_addr
+}
+
+define i8 addrspace(1)* @test7(i64 %arg) gc "statepoint-example" {
+entry:
+  ; Same as test6 but use a regular constant instead of a null
+  ; CHECK-LABEL: test7
+  ; CHECK: gc.statepoint
+  ; CHECK-NOT: call {{.*}}gc.relocate
+  %load_addr = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg
+  call void @foo() [ "deopt"() ]
+  ret i8 addrspace(1)* %load_addr
+}
+
+define i8 @test8(i8 addrspace(1)* %p) gc "statepoint-example" {
+; Checks that base( phi(gep null, oop) ) = phi(null, base(oop)) and that we
+; correctly relocate this value
+; CHECK-LABEL: @test8
+entry:
+  %is_null = icmp eq i8 addrspace(1)* %p, null
+  br i1 %is_null, label %null.crit-edge, label %not-null
+
+not-null:
+  %load_addr = getelementptr inbounds i8, i8 addrspace(1)* %p, i64 8
+  br label %join
+
+null.crit-edge:
+  %load_addr.const = getelementptr inbounds i8, i8 addrspace(1)* null, i64 8
+  br label %join
+
+join:
+  %addr = phi i8 addrspace(1)* [ %load_addr, %not-null ], [%load_addr.const, %null.crit-edge]
+  ; CHECK: %addr.base = phi i8 addrspace(1)*
+  ; CHECK-DAG: [ %p, %not-null ]
+  ; CHECK-DAG: [ null, %null.crit-edge ]
+  ; CHECK: gc.statepoint
+  call void @foo() [ "deopt"() ]
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%addr.base, %addr.base)
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%addr.base, %addr)
+  br i1 %is_null, label %early-exit, label %use
+
+early-exit:
+  ret i8 0
+
+use:
+  %res = load i8, i8 addrspace(1)* %addr, align 1
+  ret i8 %res
+}
+
+define i8 @test9(i8 addrspace(1)* %p) gc "statepoint-example" {
+; Checks that base( phi(inttoptr, oop) ) = phi(null, base(oop)) and that we
+; correctly relocate this value
+; CHECK-LABEL: @test9
+entry:
+  %is_null = icmp eq i8 addrspace(1)* %p, null
+  br i1 %is_null, label %null.crit-edge, label %not-null
+
+not-null:
+  %load_addr = getelementptr inbounds i8, i8 addrspace(1)* %p, i64 8
+  br label %join
+
+null.crit-edge:
+  br label %join
+
+join:
+  %addr = phi i8 addrspace(1)* [ %load_addr, %not-null ], [inttoptr (i64 8 to i8 addrspace(1)*), %null.crit-edge]
+  ; CHECK: %addr.base = phi i8 addrspace(1)*
+  ; CHECK-DAG: [ %p, %not-null ]
+  ; CHECK-DAG: [ null, %null.crit-edge ]
+  ; CHECK: gc.statepoint
+  call void @foo() [ "deopt"() ]
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%addr.base, %addr.base)
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%addr.base, %addr)
+  br i1 %is_null, label %early-exit, label %use
+
+early-exit:
+  ret i8 0
+
+use:
+  %res = load i8, i8 addrspace(1)* %addr, align 1
+  ret i8 %res
+}
+
+define i8 @test10(i8 addrspace(1)* %p) gc "statepoint-example" {
+; Checks that base( phi(const gep, oop) ) = phi(null, base(oop)) and that we
+; correctly relocate this value
+; CHECK-LABEL: @test10
+entry:
+  %is_null = icmp eq i8 addrspace(1)* %p, null
+  br i1 %is_null, label %null.crit-edge, label %not-null
+
+not-null:
+  %load_addr = getelementptr inbounds i8, i8 addrspace(1)* %p, i64 8
+  br label %join
+
+null.crit-edge:
+  br label %join
+
+join:
+  %addr = phi i8 addrspace(1)* [ %load_addr, %not-null ], [getelementptr (i8, i8 addrspace(1)* null, i64 8), %null.crit-edge]
+  ; CHECK: %addr.base = phi i8 addrspace(1)*
+  ; CHECK-DAG: [ %p, %not-null ]
+  ; CHECK-DAG: [ null, %null.crit-edge ]
+  ; CHECK: gc.statepoint
+  call void @foo() [ "deopt"() ]
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%addr.base, %addr.base)
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%addr.base, %addr)
+  br i1 %is_null, label %early-exit, label %use
+
+early-exit:
+  ret i8 0
+
+use:
+  %res = load i8, i8 addrspace(1)* %addr, align 1
+  ret i8 %res
+}
+
+define i32 addrspace(1)* @test11(i1 %c) gc "statepoint-example" {
+; CHECK-LABEL: @test11
+; Checks that base( select(const1, const2) ) == null and that we don't record
+; such value in the oop map
+entry:
+  %val = select i1 %c, i32 addrspace(1)* inttoptr (i64 8 to i32 addrspace(1)*), i32 addrspace(1)* inttoptr (i64 15 to i32 addrspace(1)*)
+  ; CHECK: gc.statepoint
+  ; CHECK-NOT: call {{.*}}gc.relocate
+  call void @foo() [ "deopt"() ]
+  ret i32 addrspace(1)* %val
+}
+
+
+define <2 x i32 addrspace(1)*> @test12(i1 %c) gc "statepoint-example" {
+; CHECK-LABEL: @test12
+; Same as test11 but with vectors
+entry:
+  %val = select i1 %c, <2 x i32 addrspace(1)*> <i32 addrspace(1)* inttoptr (i64 5 to i32 addrspace(1)*), 
+                                                i32 addrspace(1)* inttoptr (i64 15 to i32 addrspace(1)*)>, 
+                       <2 x i32 addrspace(1)*> <i32 addrspace(1)* inttoptr (i64 30 to i32 addrspace(1)*), 
+                                                i32 addrspace(1)* inttoptr (i64 60 to i32 addrspace(1)*)>
+  ; CHECK: gc.statepoint
+  ; CHECK-NOT: call {{.*}}gc.relocate
+  call void @foo() [ "deopt"() ]
+  ret <2 x i32 addrspace(1)*> %val
+}
+
+define <2 x i32 addrspace(1)*> @test13(i1 %c, <2 x i32 addrspace(1)*> %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test13
+; Similar to test8, test9 and test10 but with vectors
+entry:
+  %val = select i1 %c, <2 x i32 addrspace(1)*> %ptr, 
+                       <2 x i32 addrspace(1)*> <i32 addrspace(1)* inttoptr (i64 30 to i32 addrspace(1)*), i32 addrspace(1)* inttoptr (i64 60 to i32 addrspace(1)*)>
+  ; CHECK: %val.base = select i1 %c, <2 x i32 addrspace(1)*> %ptr, <2 x i32 addrspace(1)*> zeroinitializer, !is_base_value !0
+  ; CHECK: gc.statepoint
+  call void @foo() [ "deopt"() ]
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%val.base, %val.base)
+  ; CHECK-DAG: call {{.*}}gc.relocate{{.*}}(%val.base, %val)
+  ret <2 x i32 addrspace(1)*> %val
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-intrinsic-cconv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-intrinsic-cconv.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-intrinsic-cconv.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-intrinsic-cconv.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare cc42 double @llvm.experimental.deoptimize.f64(...)
+
+define double @caller_3() gc "statepoint-example" {
+; CHECK-LABEL: @caller_3(
+; CHECK: call cc42 token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint
+; CHECK:  unreachable
+
+entry:
+  %val = call cc42 double(...) @llvm.experimental.deoptimize.f64() [ "deopt"() ]
+  ret double %val
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-intrinsic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-intrinsic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-intrinsic.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-intrinsic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare i32 @llvm.experimental.deoptimize.i32(...)
+declare void @llvm.experimental.deoptimize.isVoid(...)
+
+define i32 @caller_0(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @caller_0(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @__llvm_deoptimize, i32 0
+; CHECK: unreachable
+entry:
+  %v = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 0, i32 addrspace(1)* %ptr) ]
+  ret i32 %v
+}
+
+
+define i32 @caller_1(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @caller_1
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidi32p1i32f(i64 2882400000, i32 0, void (i32, i32 addrspace(1)*)* bitcast (void ()* @__llvm_deoptimize to void (i32, i32 addrspace(1)*)*), i32 2, i32 0, i32 50, i32 addrspace(1)* %ptr
+; CHECK: unreachable
+entry:
+  %v = call i32(...) @llvm.experimental.deoptimize.i32(i32 50, i32 addrspace(1)* %ptr) [ "deopt"(i32 0) ]
+  ret i32 %v
+}
+
+define void @caller_2(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @caller_2(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @__llvm_deoptimize, i32 0
+; CHECK: unreachable
+entry:
+  call void(...) @llvm.experimental.deoptimize.isVoid() [ "deopt"(i32 0, i32 addrspace(1)* %ptr) ]
+  ret void
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/deopt-lowering-attrs.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; Check that the "deopt-lowering" function attribute gets transcoded into
+; flags on the resulting statepoint
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare void @foo()
+declare void @bar() "deopt-lowering"="live-in"
+declare void @baz() "deopt-lowering"="live-through"
+
+define void @test1() gc "statepoint-example" {
+; CHECK-LABEL: @test1(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 1, i32 57)
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @bar, i32 0, i32 2, i32 0, i32 1, i32 42)
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @baz, i32 0, i32 0, i32 0, i32 1, i32 13)
+
+entry:
+  call void @foo() [ "deopt"(i32 57) ]
+  call void @bar() [ "deopt"(i32 42) ]
+  call void @baz() [ "deopt"(i32 13) ]
+  ret void
+}
+
+; add deopt-lowering attribute as part of callsite
+define void @test2() gc "statepoint-example" {
+; CHECK-LABEL: @test2(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @foo, i32 0, i32 2, i32 0, i32 1, i32 57)
+
+entry:
+  call void @foo()  "deopt-lowering"="live-in"  [ "deopt"(i32 57) ]
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/deref-pointers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/deref-pointers.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/deref-pointers.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/deref-pointers.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,126 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+
+; CHECK: declare i8 addrspace(1)* @some_function_ret_deref()
+; CHECK: define i8 addrspace(1)* @test_deref_arg(i8 addrspace(1)* %a)
+; CHECK: define i8 addrspace(1)* @test_deref_or_null_arg(i8 addrspace(1)* %a)
+; CHECK: define i8 addrspace(1)* @test_noalias_arg(i8 addrspace(1)* %a)
+
+declare void @foo()
+
+declare i8 addrspace(1)* @some_function() "gc-leaf-function"
+
+declare void @some_function_consumer(i8 addrspace(1)*) "gc-leaf-function"
+
+declare dereferenceable(4) i8 addrspace(1)* @some_function_ret_deref() "gc-leaf-function"
+declare noalias i8 addrspace(1)* @some_function_ret_noalias() "gc-leaf-function"
+
+define i8 addrspace(1)* @test_deref_arg(i8 addrspace(1)* dereferenceable(4) %a) gc "statepoint-example" {
+entry:
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_deref_or_null_arg(i8 addrspace(1)* dereferenceable_or_null(4) %a) gc "statepoint-example" {
+entry:
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_noalias_arg(i8 addrspace(1)* noalias %a) gc "statepoint-example" {
+entry:
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_deref_retval() gc "statepoint-example" {
+; CHECK-LABEL: @test_deref_retval(
+; CHECK: %a = call i8 addrspace(1)* @some_function()
+entry:
+  %a = call dereferenceable(4) i8 addrspace(1)* @some_function()
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_deref_or_null_retval() gc "statepoint-example" {
+; CHECK-LABEL: @test_deref_or_null_retval(
+; CHECK: %a = call i8 addrspace(1)* @some_function()
+entry:
+  %a = call dereferenceable_or_null(4) i8 addrspace(1)* @some_function()
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_noalias_retval() gc "statepoint-example" {
+; CHECK-LABEL: @test_noalias_retval(
+; CHECK: %a = call i8 addrspace(1)* @some_function()
+entry:
+  %a = call noalias i8 addrspace(1)* @some_function()
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %a
+}
+
+define i8 @test_md(i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test_md(
+; CHECK: %tmp = load i8, i8 addrspace(1)* %ptr, !tbaa [[TAG_old:!.*]]
+entry:
+  %tmp = load i8, i8 addrspace(1)* %ptr, !tbaa !0
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 %tmp
+}
+
+; Same as test_md() above, but with new-format TBAA metadata.
+define i8 @test_md_new(i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test_md_new(
+; CHECK: %tmp = load i8, i8 addrspace(1)* %ptr, !tbaa [[TAG_new:!.*]]
+entry:
+  %tmp = load i8, i8 addrspace(1)* %ptr, !tbaa !4
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 %tmp
+}
+
+define i8 addrspace(1)* @test_decl_only_attribute(i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test_decl_only_attribute(
+; No change here, but the prototype of some_function_ret_deref should have changed.
+; CHECK: call i8 addrspace(1)* @some_function_ret_deref()
+entry:
+  %a = call i8 addrspace(1)* @some_function_ret_deref()
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_decl_only_noalias(i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test_decl_only_noalias(
+; No change here, but the prototype of some_function_ret_noalias should have changed.
+; CHECK: call i8 addrspace(1)* @some_function_ret_noalias()
+entry:
+  %a = call i8 addrspace(1)* @some_function_ret_noalias()
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %a
+}
+
+define i8 addrspace(1)* @test_callsite_arg_attribute(i8 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: @test_callsite_arg_attribute(
+; CHECK: call void @some_function_consumer(i8 addrspace(1)* %ptr)
+entry:
+  call void @some_function_consumer(i8 addrspace(1)* dereferenceable(4) noalias %ptr)
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i8 addrspace(1)* %ptr
+}
+
+!0 = !{!1, !2, i64 0, i64 1}  ; TAG_old
+!1 = !{!"type_base_old", !2, i64 0}
+!2 = !{!"type_access_old", !3}
+!3 = !{!"root"}
+
+!4 = !{!5, !6, i64 0, i64 1, i64 1}  ; TAG_new
+!5 = !{!3, i64 1, !"type_base_new", !6, i64 0, i64 1}
+!6 = !{!3, i64 1, !"type_access_new"}
+
+; CHECK-DAG: [[ROOT:!.*]] = !{!"root"}
+; CHECK-DAG: [[TYPE_access_old:!.*]] = !{!"type_access_old", [[ROOT]]}
+; CHECK-DAG: [[TYPE_base_old:!.*]] = !{!"type_base_old", [[TYPE_access_old]], i64 0}
+; CHECK-DAG: [[TAG_old]] = !{[[TYPE_base_old]], [[TYPE_access_old]], i64 0}
+; CHECK-DAG: [[TYPE_access_new:!.*]] = !{[[ROOT]], i64 1, !"type_access_new"}
+; CHECK-DAG: [[TYPE_base_new:!.*]] = !{[[ROOT]], i64 1, !"type_base_new", [[TYPE_access_new]], i64 0, i64 1}
+; CHECK-DAG: [[TAG_new]] = !{[[TYPE_base_new]], [[TYPE_access_new]], i64 0, i64 1}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,141 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+
+; This test checks that metadata that's invalid after RS4GC is dropped. 
+; We can miscompile if optimizations scheduled after RS4GC uses the
+; metadata that's in fact invalid.
+
+declare void @bar()
+
+declare void @baz(i32)
+; Confirm that loadedval instruction does not contain invariant.load metadata.
+; but contains the range metadata.
+; Since loadedval is not marked invariant, it will prevent incorrectly sinking
+; %loadedval in LICM and avoid creation of an unrelocated use of %baseaddr.
+define void @test_invariant_load() gc "statepoint-example" {
+; CHECK-LABEL: @test_invariant_load
+; CHECK: %loadedval = load i32, i32 addrspace(1)* %baseaddr, align 8, !range !0
+bb:
+  br label %outerloopHdr
+
+outerloopHdr:                                              ; preds = %bb6, %bb
+  %baseaddr = phi i32 addrspace(1)* [ undef, %bb ], [ %tmp4, %bb6 ]
+; LICM may sink this load to exit block after RS4GC because it's tagged invariant.
+  %loadedval = load i32, i32 addrspace(1)* %baseaddr, align 8, !range !0, !invariant.load !1
+  br label %innerloopHdr
+
+innerloopHdr:                                              ; preds = %innerlooplatch, %outerloopHdr
+  %tmp4 = phi i32 addrspace(1)* [ %baseaddr, %outerloopHdr ], [ %gep, %innerlooplatch ]
+  br label %innermostloophdr
+
+innermostloophdr:                                              ; preds = %bb6, %innerloopHdr
+  br i1 undef, label %exitblock, label %bb6
+
+bb6:                                              ; preds = %innermostloophdr
+  switch i32 undef, label %innermostloophdr [
+    i32 0, label %outerloopHdr
+    i32 1, label %innerlooplatch
+  ]
+
+innerlooplatch:                                              ; preds = %bb6
+  call void @bar()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %tmp4, i64 8
+  br label %innerloopHdr
+
+exitblock:                                             ; preds = %innermostloophdr
+  %tmp13 = add i32 42, %loadedval
+  call void @baz(i32 %tmp13)
+  unreachable
+}
+
+; drop the noalias metadata.
+define void @test_noalias(i32 %x, i32 addrspace(1)* %p, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_noalias
+; CHECK: %y = load i32, i32 addrspace(1)* %q, align 16
+; CHECK: gc.statepoint
+; CHECK: %p.relocated
+; CHECK-NEXT: %p.relocated.casted = bitcast i8 addrspace(1)* %p.relocated to i32 addrspace(1)*
+; CHECK-NEXT: store i32 %x, i32 addrspace(1)* %p.relocated.casted, align 16
+entry:
+  %y = load i32, i32 addrspace(1)* %q, align 16, !noalias !3
+  call void @baz(i32 %x)
+  store i32 %x, i32 addrspace(1)* %p, align 16, !noalias !4
+  ret void
+}
+
+; drop the dereferenceable metadata
+define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_dereferenceable
+; CHECK: %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+; CHECK-NEXT: %v2 = load i32, i32 addrspace(1)* %v1
+; CHECK: gc.statepoint
+  %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p, !dereferenceable !5
+  %v2 = load i32, i32 addrspace(1)* %v1
+  call void @baz(i32 %x)
+  store i32 %v2, i32 addrspace(1)* %q, align 16
+  ret void
+}
+
+; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is
+; incorrect. remove the invariant.start and RAUW undef.
+define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_inv_start
+; CHECK-NOT: invariant.start
+; CHECK: gc.statepoint
+  %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+  %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
+  %v2 = load i32, i32 addrspace(1)* %v1
+  call void @baz(i32 %x)
+  br i1 %cond, label %taken, label %untaken
+
+taken:
+  store i32 %v2, i32 addrspace(1)* %q, align 16
+  call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
+  ret void
+
+; CHECK-LABEL: untaken:
+; CHECK: gc.statepoint
+untaken:
+  %foo = call i32 @escaping.invariant.start({}* %invst)
+  call void @dummy(i32 %foo)
+  ret void
+}
+
+; invariant.start is removed and the uses are undef'ed.
+define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_inv_start2
+; CHECK-NOT: invariant.start
+; CHECK: gc.statepoint
+  %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+  %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
+  %v2 = load i32, i32 addrspace(1)* %v1
+  call void @baz(i32 %x)
+  br i1 %cond, label %taken, label %untaken
+
+taken:
+  store i32 %v2, i32 addrspace(1)* %q, align 16
+  call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
+  ret void
+
+untaken:
+  ret void
+}
+declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)*  nocapture) nounwind readonly
+declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind
+declare i32 @escaping.invariant.start({}*) nounwind
+declare void @dummy(i32)
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...)
+
+; Function Attrs: nounwind readonly
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) #0
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+
+attributes #0 = { nounwind readonly }
+
+!0 = !{i32 0, i32 2147483647}
+!1 = !{}
+!2 = !{i32 10, i32 1}
+!3 = !{!3}
+!4 = !{!4}
+!5 = !{i64 8}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/gc-relocate-creation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/gc-relocate-creation.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/gc-relocate-creation.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/gc-relocate-creation.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck %s
+
+; This test is to verify gc.relocate can handle pointer to vector of
+; pointers (<2 x i32 addrspace(1)*> addrspace(1)* in this case).
+; The old scheme to create a gc.relocate of <2 x i32 addrspace(1)*> addrspace(1)*
+; type will fail because llvm does not support mangling vector of pointers.
+; The new scheme will create all gc.relocate to i8 addrspace(1)* type and
+; then bitcast to the correct type.
+
+declare void @foo()
+
+declare void @use(...) "gc-leaf-function"
+
+define void @test1(<2 x i32 addrspace(1)*> addrspace(1)* %obj) gc "statepoint-example" {
+entry:
+; CHECK: %obj.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7)
+; CHECK-NEXT:  %obj.relocated.casted = bitcast i8 addrspace(1)* %obj.relocated to <2 x i32 addrspace(1)*> addrspace(1)*
+
+  call void @foo() [ "deopt"() ]
+  call void (...) @use(<2 x i32 addrspace(1)*> addrspace(1)* %obj)
+  ret void
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/invokes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/invokes.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/invokes.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/invokes.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,138 @@
+; RUN: opt < %s -S -rewrite-statepoints-for-gc | FileCheck %s
+; RUN: opt < %s -S -passes=rewrite-statepoints-for-gc | FileCheck %s
+
+declare i64 addrspace(1)* @some_call(i64 addrspace(1)*)
+declare i32 @personality_function()
+
+define i64 addrspace(1)* @test_basic(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) gc "statepoint-example" personality i32 ()* @personality_function {
+; CHECK-LABEL: entry:
+entry:
+  ; CHECK: invoke
+  ; CHECK: statepoint
+  ; CHECK: some_call
+  %ret_val = invoke i64 addrspace(1)* @some_call(i64 addrspace(1)* %obj)
+               to label %normal_return unwind label %exceptional_return
+
+; CHECK-LABEL: normal_return:
+; CHECK: gc.result
+; CHECK: ret i64
+
+normal_return:
+  ret i64 addrspace(1)* %ret_val
+
+; CHECK-LABEL: exceptional_return:
+; CHECK: landingpad
+; CHECK: ret i64
+
+exceptional_return:
+  %landing_pad4 = landingpad token
+          cleanup
+  ret i64 addrspace(1)* %obj1
+}
+
+declare <4 x i64 addrspace(1)*> @some_vector_call(<4 x i64 addrspace(1)*>)
+
+define <4 x i64 addrspace(1)*> @test_basic_vector(<4 x i64 addrspace(1)*> %objs, <4 x i64 addrspace(1)*> %objs1) gc "statepoint-example" personality i32 ()* @personality_function {
+; CHECK-LABEL: @test_basic_vector
+entry:
+; CHECK: invoke{{.*}}llvm.experimental.gc.statepoint{{.*}}some_vector_call
+  %ret_val = invoke <4 x i64 addrspace(1)*> @some_vector_call(<4 x i64 addrspace(1)*> %objs)
+               to label %normal_return unwind label %exceptional_return
+
+; CHECK-LABEL: normal_return:
+; CHECK: gc.result
+; CHECK: ret <4 x i64 addrspace(1)*>
+
+normal_return:
+  ret <4 x i64 addrspace(1)*> %ret_val
+
+; CHECK-LABEL: exceptional_return:
+; CHECK: landingpad
+; CHECK: ret <4 x i64 addrspace(1)*>
+
+exceptional_return:
+  %landing_pad4 = landingpad token
+          cleanup
+  ret <4 x i64 addrspace(1)*> %objs1
+}
+
+define i64 addrspace(1)* @test_two_invokes(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) gc "statepoint-example" personality i32 ()* @personality_function {
+; CHECK-LABEL: entry:
+entry:
+  ; CHECK: invoke 
+  ; CHECK: statepoint
+  ; CHECK: some_call
+  %ret_val1 = invoke i64 addrspace(1)* @some_call(i64 addrspace(1)* %obj)
+               to label %second_invoke unwind label %exceptional_return
+
+; CHECK-LABEL: second_invoke:
+second_invoke:
+  ; CHECK: invoke
+  ; CHECK: statepoint
+  ; CHECK: some_call
+  %ret_val2 = invoke i64 addrspace(1)* @some_call(i64 addrspace(1)* %ret_val1)
+                to label %normal_return unwind label %exceptional_return
+
+; CHECK-LABEL: normal_return:
+normal_return:
+  ; CHECK: gc.result
+  ; CHECK: ret i64
+  ret i64 addrspace(1)* %ret_val2
+
+; CHECK: exceptional_return:
+; CHECK: ret i64
+
+exceptional_return:
+  %landing_pad4 = landingpad token
+          cleanup
+  ret i64 addrspace(1)* %obj1
+}
+
+define i64 addrspace(1)* @test_phi_node(i1 %cond, i64 addrspace(1)* %obj) gc "statepoint-example" personality i32 ()* @personality_function {
+; CHECK-LABEL: @test_phi_node
+; CHECK-LABEL: entry:
+entry:
+  br i1 %cond, label %left, label %right
+
+left:
+  %ret_val_left = invoke i64 addrspace(1)* @some_call(i64 addrspace(1)* %obj)
+                    to label %merge unwind label %exceptional_return
+
+right:
+  %ret_val_right = invoke i64 addrspace(1)* @some_call(i64 addrspace(1)* %obj)
+                     to label %merge unwind label %exceptional_return
+
+; CHECK: merge[[A:[0-9]]]:
+; CHECK: gc.result
+; CHECK: br label %[[with_phi:merge[0-9]*]]
+
+; CHECK: merge[[B:[0-9]]]:
+; CHECK: gc.result
+; CHECK: br label %[[with_phi]]
+
+; CHECK: [[with_phi]]:
+; CHECK: phi
+; CHECK: ret i64 addrspace(1)* %ret_val
+merge:
+  %ret_val = phi i64 addrspace(1)* [%ret_val_left, %left], [%ret_val_right, %right]
+  ret i64 addrspace(1)* %ret_val
+
+; CHECK-LABEL: exceptional_return:
+; CHECK: ret i64 addrspace(1)*
+
+exceptional_return:
+  %landing_pad4 = landingpad token
+          cleanup
+  ret i64 addrspace(1)* %obj
+}
+
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+; CHECK-LABEL: entry
+; CHECK-NEXT: do_safepoint
+; CHECK-NEXT: ret void 
+entry:
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/leaf-function.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/leaf-function.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/leaf-function.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/leaf-function.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt < %s -S -rewrite-statepoints-for-gc | FileCheck %s
+; RUN: opt < %s -S -passes=rewrite-statepoints-for-gc | FileCheck %s
+
+declare void @foo() "gc-leaf-function"
+declare void @bar()
+
+; Calls of functions with the "gc-leaf-function" attribute shouldn't be turned
+; into a safepoint.  An entry safepoint should get inserted, though.
+define void @test_leaf_function() gc "statepoint-example" {
+; CHECK-LABEL: test_leaf_function
+; CHECK-NOT: gc.statepoint
+; CHECK-NOT: gc.result
+entry:
+  call void @foo()
+  ret void
+}
+
+define void @test_leaf_function_call() gc "statepoint-example" {
+; CHECK-LABEL: test_leaf_function_call
+; CHECK-NOT: gc.statepoint
+; CHECK-NOT: gc.result
+entry:
+  call void @bar() "gc-leaf-function"
+  ret void
+}
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/libcall.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/libcall.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/libcall.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/libcall.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; A call to a libcall function is not a statepoint.
+; This test verifies that calls to libcalls functions do not get converted to
+; statepoint calls.
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+
+declare double @ldexp(double %x, i32 %n) nounwind readnone
+
+define double @test_libcall(double %x) gc "statepoint-example" {
+; CHECK-LABEL: test_libcall
+; CHECK-NEXT: %res = call double @ldexp(double %x, i32 5)
+; CHECK-NEXT: ret double %res
+  %res = call double @ldexp(double %x, i32 5) nounwind readnone
+  ret double %res
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/live-vector-nosplit.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/live-vector-nosplit.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/live-vector-nosplit.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/live-vector-nosplit.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,119 @@
+; Test that we can correctly handle vectors of pointers in statepoint 
+; rewriting.  
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck  %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck  %s
+
+; A non-vector relocation for comparison
+define i64 addrspace(1)* @test(i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: test
+; CHECK: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret i64 addrspace(1)*
+; A base vector from an argument
+entry:
+  call void @do_safepoint() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+}
+
+; A vector argument
+define <2 x i64 addrspace(1)*> @test2(<2 x i64 addrspace(1)*> %obj) gc "statepoint-example" {
+; CHECK-LABEL: test2
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+  call void @do_safepoint() [ "deopt"() ]
+  ret <2 x i64 addrspace(1)*> %obj
+}
+
+; A load
+define <2 x i64 addrspace(1)*> @test3(<2 x i64 addrspace(1)*>* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test3
+; CHECK: load
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+entry:
+  %obj = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  call void @do_safepoint() [ "deopt"() ]
+  ret <2 x i64 addrspace(1)*> %obj
+}
+
+declare i32 @fake_personality_function()
+
+; When a statepoint is an invoke rather than a call
+define <2 x i64 addrspace(1)*> @test4(<2 x i64 addrspace(1)*>* %ptr) gc "statepoint-example" personality i32 ()* @fake_personality_function {
+; CHECK-LABEL: test4
+; CHECK: load
+; CHECK-NEXT: gc.statepoint
+entry:
+  %obj = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  invoke void @do_safepoint() [ "deopt"() ]
+          to label %normal_return unwind label %exceptional_return
+
+normal_return:                                    ; preds = %entry
+; CHECK-LABEL: normal_return:
+; CHECK: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+  ret <2 x i64 addrspace(1)*> %obj
+
+exceptional_return:                               ; preds = %entry
+; CHECK-LABEL: exceptional_return:
+; CHECK: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+  %landing_pad4 = landingpad token
+          cleanup
+  ret <2 x i64 addrspace(1)*> %obj
+}
+
+; A newly created vector
+define <2 x i64 addrspace(1)*> @test5(i64 addrspace(1)* %p) gc "statepoint-example" {
+; CHECK-LABEL: test5
+; CHECK: insertelement
+; CHECK-NEXT: insertelement
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*> %vec.relocated.casted
+entry:
+  %vec = insertelement <2 x i64 addrspace(1)*> undef, i64 addrspace(1)* %p, i32 0
+  call void @do_safepoint() [ "deopt"() ]
+  ret <2 x i64 addrspace(1)*> %vec
+}
+
+; A merge point
+define <2 x i64 addrspace(1)*> @test6(i1 %cnd, <2 x i64 addrspace(1)*>* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test6
+entry:
+  br i1 %cnd, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+  %obja = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  br label %merge
+
+untaken:                                          ; preds = %entry
+  %objb = load <2 x i64 addrspace(1)*>, <2 x i64 addrspace(1)*>* %ptr
+  br label %merge
+
+merge:                                            ; preds = %untaken, %taken
+; CHECK-LABEL: merge:
+; CHECK-NEXT: = phi
+; CHECK-NEXT: = phi
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: ret <2 x i64 addrspace(1)*>
+  %obj = phi <2 x i64 addrspace(1)*> [ %obja, %taken ], [ %objb, %untaken ]
+  call void @do_safepoint() [ "deopt"() ]
+  ret <2 x i64 addrspace(1)*> %obj
+}
+
+declare void @do_safepoint()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/liveness-basics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/liveness-basics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/liveness-basics.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/liveness-basics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,166 @@
+; A collection of liveness test cases to ensure we're reporting the
+; correct live values at statepoints
+; RUN: opt -rewrite-statepoints-for-gc -spp-rematerialization-threshold=0 -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -spp-rematerialization-threshold=0 -S < %s | FileCheck %s
+
+; Tests to make sure we consider %obj live in both the taken and untaken
+; predecessor of merge.
+
+define i64 addrspace(1)* @test1(i1 %cmp, i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test1
+entry:
+  br i1 %cmp, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+; CHECK-LABEL: taken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: br label %merge
+  call void @foo() [ "deopt"() ]
+  br label %merge
+
+untaken:                                          ; preds = %entry
+; CHECK-LABEL: untaken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated2 = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: br label %merge
+  call void @foo() [ "deopt"() ]
+  br label %merge
+
+merge:                                            ; preds = %untaken, %taken
+; CHECK-LABEL: merge:
+; CHECK-NEXT: %.0 = phi i64 addrspace(1)* [ %obj.relocated.casted, %taken ], [ %obj.relocated2.casted, %untaken ]
+; CHECK-NEXT: ret i64 addrspace(1)* %.0
+; A local kill should not affect liveness in predecessor block
+  ret i64 addrspace(1)* %obj
+}
+
+define i64 addrspace(1)* @test2(i1 %cmp, i64 addrspace(1)** %loc) gc "statepoint-example" {
+; CHECK-LABEL: @test2
+entry:
+; CHECK-LABEL: entry:
+; CHECK-NEXT:  gc.statepoint
+; CHECK-NEXT:  br
+  call void @foo() [ "deopt"() ]
+  br i1 %cmp, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+; CHECK-LABEL: taken:
+; CHECK-NEXT:  %obj = load
+; CHECK-NEXT:  gc.statepoint
+; CHECK-NEXT:  gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT:  ret i64 addrspace(1)* %obj.relocated.casted
+; A local kill should affect values live from a successor phi.  Also, we
+; should only propagate liveness from a phi to the appropriate predecessors.
+  %obj = load i64 addrspace(1)*, i64 addrspace(1)** %loc
+  call void @foo() [ "deopt"() ]
+  ret i64 addrspace(1)* %obj
+
+untaken:                                          ; preds = %entry
+  ret i64 addrspace(1)* null
+}
+
+define i64 addrspace(1)* @test3(i1 %cmp, i64 addrspace(1)** %loc) gc "statepoint-example" {
+; CHECK-LABEL: @test3
+entry:
+  br i1 %cmp, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+; CHECK-LABEL: taken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj = load
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: br label %merge
+  call void @foo() [ "deopt"() ]
+  %obj = load i64 addrspace(1)*, i64 addrspace(1)** %loc
+  call void @foo() [ "deopt"() ]
+  br label %merge
+
+untaken:                                          ; preds = %entry
+; CHECK-LABEL: untaken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: br label %merge
+; A base pointer must be live if it is needed at a later statepoint,
+; even if the base pointer is otherwise unused.
+  call void @foo() [ "deopt"() ]
+  br label %merge
+
+merge:                                            ; preds = %untaken, %taken
+  %phi = phi i64 addrspace(1)* [ %obj, %taken ], [ null, %untaken ]
+  ret i64 addrspace(1)* %phi
+}
+
+define i64 addrspace(1)* @test4(i1 %cmp, i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test4
+entry:
+; CHECK-LABEL: entry:
+; CHECK-NEXT:  %derived = getelementptr
+; CHECK-NEXT:  gc.statepoint
+; CHECK-NEXT:  %derived.relocated =
+; CHECK-NEXT:  bitcast 
+; CHECK-NEXT:  %obj.relocated =
+; CHECK-NEXT:  bitcast
+; CHECK-NEXT:  gc.statepoint
+; CHECK-NEXT:  %derived.relocated2 =
+; CHECK-NEXT:  bitcast 
+
+; Note: It's legal to relocate obj again, but not strictly needed
+; CHECK-NEXT:  %obj.relocated3 =
+; CHECK-NEXT:  bitcast
+; CHECK-NEXT:  ret i64 addrspace(1)* %derived.relocated2.casted
+; 
+; Make sure that a phi def visited during iteration is considered a kill.
+; Also, liveness after base pointer analysis can change based on new uses,
+; not just new defs.
+  %derived = getelementptr i64, i64 addrspace(1)* %obj, i64 8
+  call void @foo() [ "deopt"() ]
+  call void @foo() [ "deopt"() ]
+  ret i64 addrspace(1)* %derived
+}
+
+declare void @consume(...) readonly "gc-leaf-function"
+
+define i64 addrspace(1)* @test5(i1 %cmp, i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test5
+entry:
+  br i1 %cmp, label %taken, label %untaken
+
+taken:                                            ; preds = %entry
+; CHECK-LABEL: taken:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: br label %merge
+  call void @foo() [ "deopt"() ]
+  br label %merge
+
+untaken:                                          ; preds = %entry
+; CHECK-LABEL: untaken:
+; CHECK-NEXT: br label %merge
+  br label %merge
+
+merge:                                            ; preds = %untaken, %taken
+; CHECK-LABEL: merge:
+; CHECK-NEXT: %.0 = phi i64 addrspace(1)*
+; CHECK-NEXT: %obj2a = phi
+; CHECK-NEXT: @consume
+; CHECK-NEXT: br label %final
+  %obj2a = phi i64 addrspace(1)* [ %obj, %taken ], [ null, %untaken ]
+  call void (...) @consume(i64 addrspace(1)* %obj2a)
+  br label %final
+
+final:                                            ; preds = %merge
+; CHECK-LABEL: final:
+; CHECK-NEXT: @consume
+; CHECK-NEXT: ret i64 addrspace(1)* %.0
+  call void (...) @consume(i64 addrspace(1)* %obj2a)
+  ret i64 addrspace(1)* %obj
+}
+
+declare void @foo()
+

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/patchable-statepoints.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/patchable-statepoints.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/patchable-statepoints.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/patchable-statepoints.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+
+declare void @f()
+declare i32 @personality_function()
+
+define void @test_id() gc "statepoint-example" personality i32 ()* @personality_function {
+; CHECK-LABEL: @test_id(
+entry:
+; CHECK-LABEL: entry:
+; CHECK: invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 100, i32 0, void ()* @f
+  invoke void @f()  "statepoint-id"="100" to label %normal_return unwind label %exceptional_return
+
+normal_return:
+  ret void
+
+exceptional_return:
+  %landing_pad4 = landingpad {i8*, i32} cleanup
+  ret void
+}
+
+define void @test_num_patch_bytes() gc "statepoint-example" personality i32 ()* @personality_function {
+; CHECK-LABEL: @test_num_patch_bytes(
+entry:
+; CHECK-LABEL: entry:
+; CHECK: invoke token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 99, void ()* @f,
+  invoke void @f()  "statepoint-num-patch-bytes"="99" to label %normal_return unwind label %exceptional_return
+
+normal_return:
+  ret void
+
+exceptional_return:
+  %landing_pad4 = landingpad {i8*, i32} cleanup
+  ret void
+}
+
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+entry:
+  call void @do_safepoint()
+  ret void
+}
+
+; CHECK-NOT: statepoint-id
+; CHECK-NOT: statepoint-num-patch-bytes

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/preprocess.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/preprocess.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/preprocess.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/preprocess.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,63 @@
+; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -S < %s | FileCheck %s
+
+; Test to make sure we destroy LCSSA's single entry phi nodes before
+; running liveness
+
+declare void @consume(...) "gc-leaf-function"
+
+define void @test6(i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test6
+entry:
+  br label %next
+
+next:                                             ; preds = %entry
+; CHECK-LABEL: next:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: @consume(i64 addrspace(1)* %obj.relocated.casted)
+; CHECK-NEXT: @consume(i64 addrspace(1)* %obj.relocated.casted)
+; Need to delete unreachable gc.statepoint call
+  %obj2 = phi i64 addrspace(1)* [ %obj, %entry ]
+  call void @foo() [ "deopt"() ]
+  call void (...) @consume(i64 addrspace(1)* %obj2)
+  call void (...) @consume(i64 addrspace(1)* %obj)
+  ret void
+}
+
+define void @test7() gc "statepoint-example" {
+; CHECK-LABEL: test7
+; CHECK-NOT: gc.statepoint
+; Need to delete unreachable gc.statepoint invoke - tested separately given
+; a correct implementation could only remove the instructions, not the block
+  ret void
+
+unreached:                                        ; preds = %unreached
+  %obj = phi i64 addrspace(1)* [ null, %unreached ]
+  call void @foo() [ "deopt"() ]
+  call void (...) @consume(i64 addrspace(1)* %obj)
+  br label %unreached
+}
+
+define void @test8() gc "statepoint-example" personality i32 ()* undef {
+; CHECK-LABEL: test8
+; CHECK-NOT: gc.statepoint
+; Bound the last check-not
+  ret void
+
+unreached:                                        ; No predecessors!
+  invoke void @foo() [ "deopt"() ]
+; CHECK-LABEL: @foo
+          to label %normal_return unwind label %exceptional_return
+
+normal_return:                                    ; preds = %unreached
+  ret void
+
+exceptional_return:                               ; preds = %unreached
+  %landing_pad4 = landingpad { i8*, i32 }
+          cleanup
+  ret void
+}
+
+declare void @foo()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/relocate-invoke-result.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/relocate-invoke-result.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/relocate-invoke-result.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/relocate-invoke-result.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+
+;; RUN: opt -rewrite-statepoints-for-gc -verify -S < %s | FileCheck %s
+;; RUN: opt -passes=rewrite-statepoints-for-gc -verify -S < %s | FileCheck %s
+;; This test is to verify that RewriteStatepointsForGC correctly relocates values
+;; defined by invoke instruction results. 
+
+declare i64* addrspace(1)* @non_gc_call() "gc-leaf-function"
+
+declare void @gc_call()
+
+declare i32* @fake_personality_function()
+
+define i64* addrspace(1)* @test() gc "statepoint-example" personality i32* ()* @fake_personality_function {
+; CHECK-LABEL: @test(
+
+entry:
+  %obj = invoke i64* addrspace(1)* @non_gc_call()
+          to label %normal_dest unwind label %unwind_dest
+
+unwind_dest:                                      ; preds = %entry
+  %lpad = landingpad { i8*, i32 }
+          cleanup
+  resume { i8*, i32 } undef
+
+normal_dest:                                      ; preds = %entry
+; CHECK: normal_dest:
+; CHECK-NEXT: gc.statepoint
+; CHECK-NEXT: %obj.relocated = call coldcc i8 addrspace(1)*
+; CHECK-NEXT: bitcast
+
+  call void @gc_call() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i64* addrspace(1)* %obj
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/relocation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/relocation.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/relocation.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/relocation.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,286 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -spp-rematerialization-threshold=0 -S | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -spp-rematerialization-threshold=0 -S | FileCheck %s
+
+
+declare void @foo()
+
+declare void @use(...) "gc-leaf-function"
+
+define i64 addrspace(1)* @test1(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2, i1 %condition) gc "statepoint-example" {
+; CHECK-LABEL: @test1
+; CHECK-DAG: %obj.relocated
+; CHECK-DAG: %obj2.relocated
+entry:
+  call void @foo() [ "deopt"() ]
+  br label %joint
+
+joint:                                            ; preds = %joint2, %entry
+; CHECK-LABEL: joint:
+; CHECK: %phi1 = phi i64 addrspace(1)* [ %obj.relocated.casted, %entry ], [ %obj3, %joint2 ]
+  %phi1 = phi i64 addrspace(1)* [ %obj, %entry ], [ %obj3, %joint2 ]
+  br i1 %condition, label %use, label %joint2
+
+use:                                              ; preds = %joint
+  br label %joint2
+
+joint2:                                           ; preds = %use, %joint
+; CHECK-LABEL: joint2:
+; CHECK: %phi2 = phi i64 addrspace(1)* [ %obj.relocated.casted, %use ], [ %obj2.relocated.casted, %joint ]
+; CHECK: %obj3 = getelementptr i64, i64 addrspace(1)* %obj2.relocated.casted, i32 1
+  %phi2 = phi i64 addrspace(1)* [ %obj, %use ], [ %obj2, %joint ]
+  %obj3 = getelementptr i64, i64 addrspace(1)* %obj2, i32 1
+  br label %joint
+}
+
+declare i64 addrspace(1)* @generate_obj() "gc-leaf-function"
+
+declare void @consume_obj(i64 addrspace(1)*) "gc-leaf-function"
+
+declare i1 @rt() "gc-leaf-function"
+
+define void @test2() gc "statepoint-example" {
+; CHECK-LABEL: @test2
+entry:
+  %obj_init = call i64 addrspace(1)* @generate_obj()
+  %obj = getelementptr i64, i64 addrspace(1)* %obj_init, i32 42
+  br label %loop
+
+loop:                                             ; preds = %loop.backedge, %entry
+; CHECK: loop:
+; CHECK-DAG: [ %obj_init.relocated.casted, %loop.backedge ]
+; CHECK-DAG: [ %obj_init, %entry ]
+; CHECK-DAG: [ %obj.relocated.casted, %loop.backedge ]
+; CHECK-DAG: [ %obj, %entry ]
+; CHECK-NOT: %location = getelementptr i64, i64 addrspace(1)* %obj, i32 %index
+  %index = phi i32 [ 0, %entry ], [ %index.inc, %loop.backedge ]
+  %location = getelementptr i64, i64 addrspace(1)* %obj, i32 %index
+  call void @consume_obj(i64 addrspace(1)* %location)
+  %index.inc = add i32 %index, 1
+  %condition = call i1 @rt()
+  br i1 %condition, label %loop_x, label %loop_y
+
+loop_x:                                           ; preds = %loop
+  br label %loop.backedge
+
+loop.backedge:                                    ; preds = %loop_y, %loop_x
+  call void @do_safepoint() [ "deopt"() ]
+  br label %loop
+
+loop_y:                                           ; preds = %loop
+  br label %loop.backedge
+}
+
+declare void @some_call(i8 addrspace(1)*) "gc-leaf-function"
+
+define void @relocate_merge(i1 %cnd, i8 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: @relocate_merge
+
+bci_0:
+  br i1 %cnd, label %if_branch, label %else_branch
+
+if_branch:                                        ; preds = %bci_0
+; CHECK-LABEL: if_branch:
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+  call void @foo() [ "deopt"() ]
+  br label %join
+
+else_branch:                                      ; preds = %bci_0
+; CHECK-LABEL: else_branch:
+; CHECK: gc.statepoint
+; CHECK: gc.relocate
+; We need to end up with a single relocation phi updated from both paths 
+  call void @foo() [ "deopt"() ]
+  br label %join
+
+join:                                             ; preds = %else_branch, %if_branch
+; CHECK-LABEL: join:
+; CHECK: phi i8 addrspace(1)*
+; CHECK-DAG: [ %arg.relocated, %if_branch ]
+; CHECK-DAG: [ %arg.relocated2, %else_branch ]
+; CHECK-NOT: phi
+  call void @some_call(i8 addrspace(1)* %arg)
+  ret void
+}
+
+declare void @goo(i64)
+
+declare i32 @moo(i64 addrspace(1)*)
+
+; Make sure a use in a statepoint gets properly relocated at a previous one.  
+; This is basically just making sure that statepoints aren't accidentally 
+; treated specially.
+define void @test3(i64 addrspace(1)* %obj) gc "statepoint-example" {
+; CHECK-LABEL: @test3
+; CHECK: gc.statepoint
+; CHECK-NEXT: gc.relocate
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: gc.statepoint
+entry:
+  call void @goo(i64 undef) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  %0 = call i32 @moo(i64 addrspace(1)* %obj) [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret void
+}
+
+declare i8 addrspace(1)* @boo()
+
+; Check specifically for the case where the result of a statepoint needs to 
+; be relocated itself
+define void @test4() gc "statepoint-example" {
+; CHECK-LABEL: @test4
+; CHECK: gc.statepoint
+; CHECK: gc.result
+; CHECK: gc.statepoint
+; CHECK: [[RELOCATED:%[^ ]+]] = call {{.*}}gc.relocate
+; CHECK: @use(i8 addrspace(1)* [[RELOCATED]])
+  %1 = call i8 addrspace(1)* @boo() [ "deopt"() ]
+  %2 = call i8 addrspace(1)* @boo() [ "deopt"() ]
+  call void (...) @use(i8 addrspace(1)* %1)
+  ret void
+}
+
+; Test updating a phi where not all inputs are live to begin with
+define void @test5(i8 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: test5
+entry:
+  %0 = call i8 addrspace(1)* @boo() [ "deopt"() ]
+  switch i32 undef, label %kill [
+    i32 10, label %merge
+    i32 13, label %merge
+  ]
+
+kill:                                             ; preds = %entry
+  br label %merge
+
+merge:                                            ; preds = %kill, %entry, %entry
+; CHECK: merge:
+; CHECK: %test = phi i8 addrspace(1)
+; CHECK-DAG: [ null, %kill ]
+; CHECK-DAG: [ %arg.relocated, %entry ]
+; CHECK-DAG: [ %arg.relocated, %entry ]
+  %test = phi i8 addrspace(1)* [ null, %kill ], [ %arg, %entry ], [ %arg, %entry ]
+  call void (...) @use(i8 addrspace(1)* %test)
+  ret void
+}
+
+; Check to make sure we handle values live over an entry statepoint
+define void @test6(i8 addrspace(1)* %arg1, i8 addrspace(1)* %arg2, i8 addrspace(1)* %arg3) gc "statepoint-example" {
+; CHECK-LABEL: @test6
+entry:
+  br i1 undef, label %gc.safepoint_poll.exit2, label %do_safepoint
+
+do_safepoint:                                     ; preds = %entry
+; CHECK-LABEL: do_safepoint:
+; CHECK: gc.statepoint
+; CHECK: arg1.relocated = 
+; CHECK: arg2.relocated = 
+; CHECK: arg3.relocated = 
+  call void @foo() [ "deopt"(i8 addrspace(1)* %arg1, i8 addrspace(1)* %arg2, i8 addrspace(1)* %arg3) ]
+  br label %gc.safepoint_poll.exit2
+
+gc.safepoint_poll.exit2:                          ; preds = %do_safepoint, %entry
+; CHECK-LABEL: gc.safepoint_poll.exit2:
+; CHECK: phi i8 addrspace(1)*
+; CHECK-DAG: [ %arg3, %entry ]
+; CHECK-DAG: [ %arg3.relocated, %do_safepoint ]
+; CHECK: phi i8 addrspace(1)*
+; CHECK-DAG: [ %arg2, %entry ]
+; CHECK-DAG: [ %arg2.relocated, %do_safepoint ]
+; CHECK: phi i8 addrspace(1)*
+; CHECK-DAG: [ %arg1, %entry ]
+; CHECK-DAG:  [ %arg1.relocated, %do_safepoint ]
+  call void (...) @use(i8 addrspace(1)* %arg1, i8 addrspace(1)* %arg2, i8 addrspace(1)* %arg3)
+  ret void
+}
+
+; Check relocation in a loop nest where a relocation happens in the outer
+; but not the inner loop
+define void @test_outer_loop(i8 addrspace(1)* %arg1, i8 addrspace(1)* %arg2, i1 %cmp) gc "statepoint-example" {
+; CHECK-LABEL: @test_outer_loop
+
+bci_0:
+  br label %outer-loop
+
+outer-loop:                                       ; preds = %outer-inc, %bci_0
+; CHECK-LABEL: outer-loop:
+; CHECK: phi i8 addrspace(1)* [ %arg2, %bci_0 ], [ %arg2.relocated, %outer-inc ]
+; CHECK: phi i8 addrspace(1)* [ %arg1, %bci_0 ], [ %arg1.relocated, %outer-inc ]
+  br label %inner-loop
+
+inner-loop:                                       ; preds = %inner-loop, %outer-loop
+  br i1 %cmp, label %inner-loop, label %outer-inc
+
+outer-inc:                                        ; preds = %inner-loop
+; CHECK-LABEL: outer-inc:
+; CHECK: %arg1.relocated
+; CHECK: %arg2.relocated
+  call void @foo() [ "deopt"(i8 addrspace(1)* %arg1, i8 addrspace(1)* %arg2) ]
+  br label %outer-loop
+}
+
+; Check that both inner and outer loops get phis when the relocation is in
+; the inner loop
+define void @test_inner_loop(i8 addrspace(1)* %arg1, i8 addrspace(1)* %arg2, i1 %cmp) gc "statepoint-example" {
+; CHECK-LABEL: @test_inner_loop
+
+bci_0:
+  br label %outer-loop
+
+outer-loop:                                       ; preds = %outer-inc, %bci_0
+; CHECK-LABEL: outer-loop:
+; CHECK: phi i8 addrspace(1)* [ %arg2, %bci_0 ], [ %arg2.relocated, %outer-inc ]
+; CHECK: phi i8 addrspace(1)* [ %arg1, %bci_0 ], [ %arg1.relocated, %outer-inc ]
+  br label %inner-loop
+; CHECK-LABEL: inner-loop
+; CHECK: phi i8 addrspace(1)* 
+; CHECK-DAG: %outer-loop ]
+; CHECK-DAG: [ %arg2.relocated, %inner-loop ]
+; CHECK: phi i8 addrspace(1)* 
+; CHECK-DAG: %outer-loop ]
+; CHECK-DAG: [ %arg1.relocated, %inner-loop ]
+; CHECK: gc.statepoint
+; CHECK: %arg1.relocated
+; CHECK: %arg2.relocated
+
+inner-loop:                                       ; preds = %inner-loop, %outer-loop
+  call void @foo() [ "deopt"(i8 addrspace(1)* %arg1, i8 addrspace(1)* %arg2) ]
+  br i1 %cmp, label %inner-loop, label %outer-inc
+
+outer-inc:                                        ; preds = %inner-loop
+; CHECK-LABEL: outer-inc:
+; This test shows why updating just those uses of the original value being
+; relocated dominated by the inserted relocation is not always sufficient.
+  br label %outer-loop
+}
+
+define i64 addrspace(1)* @test7(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj2, i1 %condition) gc "statepoint-example" {
+; CHECK-LABEL: @test7
+entry:
+  br i1 %condition, label %branch2, label %join
+
+branch2:                                          ; preds = %entry
+  br i1 %condition, label %callbb, label %join2
+
+callbb:                                           ; preds = %branch2
+  call void @foo() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %join
+
+join:                                             ; preds = %callbb, %entry
+; CHECK-LABEL: join:
+; CHECK: phi i64 addrspace(1)* [ %obj.relocated.casted, %callbb ], [ %obj, %entry ]
+; CHECK: phi i64 addrspace(1)* 
+; CHECK-DAG: [ %obj, %entry ]
+; CHECK-DAG: [ %obj2.relocated.casted, %callbb ]
+  %phi1 = phi i64 addrspace(1)* [ %obj, %entry ], [ %obj2, %callbb ]
+  br label %join2
+
+join2:                                            ; preds = %join, %branch2
+; CHECK-LABEL: join2:
+; CHECK: phi2 = phi i64 addrspace(1)* 
+; CHECK-DAG: %join ] 
+; CHECK-DAG:  [ %obj2, %branch2 ]
+  %phi2 = phi i64 addrspace(1)* [ %obj, %join ], [ %obj2, %branch2 ]
+  ret i64 addrspace(1)* %phi2
+}
+
+declare void @do_safepoint()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/rematerialize-derived-pointers.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,331 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck %s
+
+
+declare void @use_obj16(i16 addrspace(1)*) "gc-leaf-function"
+declare void @use_obj32(i32 addrspace(1)*) "gc-leaf-function"
+declare void @use_obj64(i64 addrspace(1)*) "gc-leaf-function"
+
+declare void @do_safepoint()
+
+define void @test_gep_const(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_gep_const
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %base, i32 15
+; CHECK: getelementptr i32, i32 addrspace(1)* %base, i32 15
+  call void @do_safepoint() [ "deopt"() ]
+; CHECK: %base.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7)
+; CHECK: bitcast i8 addrspace(1)* %base.relocated to i32 addrspace(1)*
+; CHECK: getelementptr i32, i32 addrspace(1)* %base.relocated.casted, i32 15
+  call void @use_obj32(i32 addrspace(1)* %base)
+  call void @use_obj32(i32 addrspace(1)* %ptr)
+  ret void
+}
+
+define void @test_gep_idx(i32 addrspace(1)* %base, i32 %idx) gc "statepoint-example" {
+; CHECK-LABEL: test_gep_idx
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %base, i32 %idx
+; CHECK: getelementptr
+  call void @do_safepoint() [ "deopt"() ]
+; CHECK: %base.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7)
+; CHECK: %base.relocated.casted = bitcast i8 addrspace(1)* %base.relocated to i32 addrspace(1)*
+; CHECK: getelementptr i32, i32 addrspace(1)* %base.relocated.casted, i32 %idx
+  call void @use_obj32(i32 addrspace(1)* %base)
+  call void @use_obj32(i32 addrspace(1)* %ptr)
+  ret void
+}
+
+define void @test_bitcast(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_bitcast
+entry:
+  %ptr = bitcast i32 addrspace(1)* %base to i64 addrspace(1)*
+; CHECK: bitcast i32 addrspace(1)* %base to i64 addrspace(1)*
+  call void @do_safepoint() [ "deopt"() ]
+; CHECK: %base.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7)
+; CHECK: %base.relocated.casted = bitcast i8 addrspace(1)* %base.relocated to i32 addrspace(1)*
+; CHECK: bitcast i32 addrspace(1)* %base.relocated.casted to i64 addrspace(1)*
+  call void @use_obj32(i32 addrspace(1)* %base)
+  call void @use_obj64(i64 addrspace(1)* %ptr)
+  ret void
+}
+
+define void @test_bitcast_bitcast(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_bitcast_bitcast
+entry:
+  %ptr1 = bitcast i32 addrspace(1)* %base to i64 addrspace(1)*
+  %ptr2 = bitcast i64 addrspace(1)* %ptr1 to i16 addrspace(1)*
+; CHECK: bitcast i32 addrspace(1)* %base to i64 addrspace(1)*
+; CHECK: bitcast i64 addrspace(1)* %ptr1 to i16 addrspace(1)*
+  call void @do_safepoint() [ "deopt"() ]
+
+; CHECK: %base.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7)
+; CHECK: %base.relocated.casted = bitcast i8 addrspace(1)* %base.relocated to i32 addrspace(1)*
+; CHECK: bitcast i32 addrspace(1)* %base.relocated.casted to i64 addrspace(1)*
+; CHECK: bitcast i64 addrspace(1)* %ptr1.remat to i16 addrspace(1)*
+  call void @use_obj32(i32 addrspace(1)* %base)
+  call void @use_obj16(i16 addrspace(1)* %ptr2)
+  ret void
+}
+
+define void @test_addrspacecast_addrspacecast(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_addrspacecast_addrspacecast
+entry:
+  %ptr1 = addrspacecast i32 addrspace(1)* %base to i32*
+  %ptr2 = addrspacecast i32* %ptr1 to i32 addrspace(1)*
+; CHECK: addrspacecast i32 addrspace(1)* %base to i32*
+; CHECK: addrspacecast i32* %ptr1 to i32 addrspace(1)*
+  call void @do_safepoint() [ "deopt"() ]
+
+; CHECK: %ptr2.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 8, i32 7)
+; CHECK: %ptr2.relocated.casted = bitcast i8 addrspace(1)* %ptr2.relocated to i32 addrspace(1)*
+; CHECK: %base.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 8, i32 8)
+; CHECK: %base.relocated.casted = bitcast i8 addrspace(1)* %base.relocated to i32 addrspace(1)*
+  call void @use_obj32(i32 addrspace(1)* %base)
+  call void @use_obj32(i32 addrspace(1)* %ptr2)
+  ret void
+}
+
+define void @test_bitcast_gep(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_bitcast_gep
+entry:
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %base, i32 15
+; CHECK: getelementptr
+; CHECK: bitcast i32 addrspace(1)* %ptr.gep to i64 addrspace(1)*
+  %ptr.cast = bitcast i32 addrspace(1)* %ptr.gep to i64 addrspace(1)*
+  call void @do_safepoint() [ "deopt"() ]
+
+; CHECK: gc.relocate
+; CHECK: bitcast
+; CHECK: getelementptr
+; CHECK: bitcast
+  call void @use_obj32(i32 addrspace(1)* %base)
+  call void @use_obj64(i64 addrspace(1)* %ptr.cast)
+  ret void
+}
+
+define void @test_intersecting_chains(i32 addrspace(1)* %base, i32 %idx) gc "statepoint-example" {
+; CHECK-LABEL: test_intersecting_chains
+entry:
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %base, i32 15
+; CHECK: getelementptr
+  %ptr.cast = bitcast i32 addrspace(1)* %ptr.gep to i64 addrspace(1)*
+; CHECK: bitcast
+  %ptr.cast2 = bitcast i32 addrspace(1)* %ptr.gep to i16 addrspace(1)*
+; CHECK: bitcast
+  call void @do_safepoint() [ "deopt"() ]
+
+; CHECK: getelementptr
+; CHECK: bitcast
+; CHECK: getelementptr
+; CHECK: bitcast
+  call void @use_obj64(i64 addrspace(1)* %ptr.cast)
+  call void @use_obj16(i16 addrspace(1)* %ptr.cast2)
+  ret void
+}
+
+define void @test_cost_threshold(i32 addrspace(1)* %base, i32 %idx1, i32 %idx2, i32 %idx3) gc "statepoint-example" {
+; CHECK-LABEL: test_cost_threshold
+entry:
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %base, i32 15
+; CHECK: getelementptr
+  %ptr.gep2 = getelementptr i32, i32 addrspace(1)* %ptr.gep, i32 %idx1
+; CHECK: getelementptr
+  %ptr.gep3 = getelementptr i32, i32 addrspace(1)* %ptr.gep2, i32 %idx2
+; CHECK: getelementptr
+  %ptr.gep4 = getelementptr i32, i32 addrspace(1)* %ptr.gep3, i32 %idx3
+; CHECK: getelementptr
+  %ptr.cast = bitcast i32 addrspace(1)* %ptr.gep4 to i64 addrspace(1)*
+  call void @do_safepoint() [ "deopt"() ]
+
+; CHECK: gc.relocate
+; CHECK: bitcast
+; CHECK: gc.relocate
+; CHECK: bitcast
+  call void @use_obj64(i64 addrspace(1)* %ptr.cast)
+  ret void
+}
+
+define void @test_two_derived(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_two_derived
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %base, i32 15
+  %ptr2 = getelementptr i32, i32 addrspace(1)* %base, i32 12
+; CHECK: getelementptr
+; CHECK: getelementptr
+  call void @do_safepoint() [ "deopt"() ]
+
+; CHECK: gc.relocate
+; CHECK: bitcast
+; CHECK: getelementptr
+; CHECK: getelementptr
+  call void @use_obj32(i32 addrspace(1)* %ptr)
+  call void @use_obj32(i32 addrspace(1)* %ptr2)
+  ret void
+}
+
+define void @test_gep_smallint_array([3 x i32] addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_gep_smallint_array
+entry:
+  %ptr = getelementptr [3 x i32], [3 x i32] addrspace(1)* %base, i32 0, i32 2
+; CHECK: getelementptr
+  call void @do_safepoint() [ "deopt"() ]
+
+; CHECK: gc.relocate
+; CHECK: bitcast
+; CHECK: getelementptr
+  call void @use_obj32(i32 addrspace(1)* %ptr)
+  ret void
+}
+
+declare i32 @fake_personality_function()
+
+define void @test_invoke(i32 addrspace(1)* %base) gc "statepoint-example" personality i32 ()* @fake_personality_function {
+; CHECK-LABEL: test_invoke
+entry:
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %base, i32 15
+; CHECK: getelementptr
+  %ptr.cast = bitcast i32 addrspace(1)* %ptr.gep to i64 addrspace(1)*
+; CHECK: bitcast
+  %ptr.cast2 = bitcast i32 addrspace(1)* %ptr.gep to i16 addrspace(1)*
+; CHECK: bitcast
+  invoke void @do_safepoint() [ "deopt"() ]
+          to label %normal unwind label %exception
+
+normal:
+; CHECK: normal:
+; CHECK: gc.relocate
+; CHECK: bitcast
+; CHECK: getelementptr
+; CHECK: bitcast
+; CHECK: getelementptr
+; CHECK: bitcast
+  call void @use_obj64(i64 addrspace(1)* %ptr.cast)
+  call void @use_obj16(i16 addrspace(1)* %ptr.cast2)
+  ret void
+
+exception:
+; CHECK: exception:
+  %landing_pad4 = landingpad token
+          cleanup
+; CHECK: gc.relocate
+; CHECK: bitcast
+; CHECK: getelementptr
+; CHECK: bitcast
+; CHECK: getelementptr
+; CHECK: bitcast
+  call void @use_obj64(i64 addrspace(1)* %ptr.cast)
+  call void @use_obj16(i16 addrspace(1)* %ptr.cast2)
+  ret void
+}
+
+define void @test_loop(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_loop
+entry:
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %base, i32 15
+; CHECK: getelementptr
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+; CHECK: phi i32 addrspace(1)* [ %ptr.gep, %entry ], [ %ptr.gep.remat, %loop ]
+; CHECK: phi i32 addrspace(1)* [ %base, %entry ], [ %base.relocated.casted, %loop ]
+  call void @use_obj32(i32 addrspace(1)* %ptr.gep)
+  call void @do_safepoint() [ "deopt"() ]
+; CHECK: gc.relocate
+; CHECK: bitcast
+; CHECK: getelementptr
+  br label %loop
+}
+
+define void @test_too_long(i32 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: test_too_long
+entry:
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %base, i32 15
+  %ptr.gep1 = getelementptr i32, i32 addrspace(1)* %ptr.gep, i32 15
+  %ptr.gep2 = getelementptr i32, i32 addrspace(1)* %ptr.gep1, i32 15
+  %ptr.gep3 = getelementptr i32, i32 addrspace(1)* %ptr.gep2, i32 15
+  %ptr.gep4 = getelementptr i32, i32 addrspace(1)* %ptr.gep3, i32 15
+  %ptr.gep5 = getelementptr i32, i32 addrspace(1)* %ptr.gep4, i32 15
+  %ptr.gep6 = getelementptr i32, i32 addrspace(1)* %ptr.gep5, i32 15
+  %ptr.gep7 = getelementptr i32, i32 addrspace(1)* %ptr.gep6, i32 15
+  %ptr.gep8 = getelementptr i32, i32 addrspace(1)* %ptr.gep7, i32 15
+  %ptr.gep9 = getelementptr i32, i32 addrspace(1)* %ptr.gep8, i32 15
+  %ptr.gep10 = getelementptr i32, i32 addrspace(1)* %ptr.gep9, i32 15
+  %ptr.gep11 = getelementptr i32, i32 addrspace(1)* %ptr.gep10, i32 15
+  call void @do_safepoint() [ "deopt"() ]
+; CHECK: gc.relocate
+; CHECK: bitcast
+; CHECK: gc.relocate
+; CHECK: bitcast
+  call void @use_obj32(i32 addrspace(1)* %ptr.gep11)
+  ret void
+}
+
+
+declare i32 addrspace(1)* @new_instance() nounwind "gc-leaf-function"
+
+; Rematerialize the gep in the presence of a base pointer which is a phi node.
+; FIXME: We should remove the extra basephi.base as well.
+define void @contains_basephi(i1 %cond) gc "statepoint-example" {
+; CHECK-LABEL: contains_basephi
+entry:
+  %base1 = call i32 addrspace(1)* @new_instance()
+  %base2 = call i32 addrspace(1)* @new_instance()
+  br i1 %cond, label %here, label %there
+
+here:
+  br label %merge
+
+there:
+  br label %merge
+
+merge:
+  ; CHECK: %basephi.base = phi i32 addrspace(1)* [ %base1, %here ], [ %base2, %there ], !is_base_value !0
+  ; CHECK: %basephi = phi i32 addrspace(1)* [ %base1, %here ], [ %base2, %there ]
+  ; CHECK: %ptr.gep = getelementptr i32, i32 addrspace(1)* %basephi, i32 15
+  ; CHECK: %statepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint
+  ; CHECK: %basephi.base.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 7) ; (%basephi.base, %basephi.base)
+  ; CHECK: %basephi.base.relocated.casted = bitcast i8 addrspace(1)* %basephi.base.relocated to i32 addrspace(1)*
+  ; CHECK: %ptr.gep.remat = getelementptr i32, i32 addrspace(1)* %basephi.base.relocated.casted, i32 15
+  ; CHECK: call void @use_obj32(i32 addrspace(1)* %ptr.gep.remat)
+
+
+
+  %basephi = phi i32 addrspace(1)* [ %base1, %here ], [ %base2, %there ]
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %basephi, i32 15
+  call void @do_safepoint() ["deopt"() ]
+  call void @use_obj32(i32 addrspace(1)* %ptr.gep)
+  ret void
+}
+
+
+define void @test_intersecting_chains_with_phi(i1 %cond) gc "statepoint-example" {
+; CHECK-LABEL: test_intersecting_chains_with_phi
+entry:
+  %base1 = call i32 addrspace(1)* @new_instance()
+  %base2 = call i32 addrspace(1)* @new_instance()
+  br i1 %cond, label %here, label %there
+
+here:
+  br label %merge
+
+there:
+  br label %merge
+
+merge:
+  %basephi = phi i32 addrspace(1)* [ %base1, %here ], [ %base2, %there ]
+  %ptr.gep = getelementptr i32, i32 addrspace(1)* %basephi, i32 15
+  %ptr.cast = bitcast i32 addrspace(1)* %ptr.gep to i64 addrspace(1)*
+  %ptr.cast2 = bitcast i32 addrspace(1)* %ptr.gep to i16 addrspace(1)*
+  call void @do_safepoint() [ "deopt"() ]
+  ; CHECK: statepoint
+  ; CHECK: %ptr.gep.remat1 = getelementptr i32, i32 addrspace(1)* %basephi.base.relocated.casted, i32 15
+  ; CHECK: %ptr.cast.remat = bitcast i32 addrspace(1)* %ptr.gep.remat1 to i64 addrspace(1)*
+  ; CHECK: %ptr.gep.remat = getelementptr i32, i32 addrspace(1)* %basephi.base.relocated.casted, i32 15
+  ; CHECK: %ptr.cast2.remat = bitcast i32 addrspace(1)* %ptr.gep.remat to i16 addrspace(1)*
+  ; CHECK: call void @use_obj64(i64 addrspace(1)* %ptr.cast.remat)
+  ; CHECK: call void @use_obj16(i16 addrspace(1)* %ptr.cast2.remat)
+  call void @use_obj64(i64 addrspace(1)* %ptr.cast)
+  call void @use_obj16(i16 addrspace(1)* %ptr.cast2)
+  ret void
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/rewrite-invoke.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/rewrite-invoke.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/rewrite-invoke.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/rewrite-invoke.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+; RUN: opt -rewrite-statepoints-for-gc -verify -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -verify -S < %s | FileCheck %s
+
+declare i8 addrspace(1)* @gc_call()
+
+declare i32* @fake_personality_function()
+
+define i8 addrspace(1)* @test(i1 %c) gc "statepoint-example" personality i32* ()* @fake_personality_function {
+; CHECK-LABEL: @test(
+entry:
+  br i1 %c, label %gc_invoke, label %normal_dest
+
+gc_invoke:
+; CHECK: [[TOKEN:%[^ ]+]] = invoke token {{[^@]+}}@llvm.experimental.gc.statepoint{{[^@]+}}@gc_call
+  %obj = invoke i8 addrspace(1)* @gc_call() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+          to label %normal_dest unwind label %unwind_dest
+
+unwind_dest:
+; CHECK: unwind_dest:
+  %lpad = landingpad { i8*, i32 }
+          cleanup
+  resume { i8*, i32 } undef
+
+; CHECK: [[NORMAL_DEST_SPLIT:[^:]+:]]
+; CHECK-NEXT: [[RET_VAL:%[^ ]+]] = call i8 addrspace(1)* @llvm.experimental.gc.result.p1i8(token [[TOKEN]])
+; CHECK-NEXT: br label %normal_dest
+
+normal_dest:
+; CHECK: normal_dest:
+; CHECK-NEXT: %merge = phi i8 addrspace(1)* [ null, %entry ], [ %obj2, %normal_dest1 ]
+  %merge = phi i8 addrspace(1)* [ null, %entry ], [ %obj, %gc_invoke ]
+  ret i8 addrspace(1)* %merge
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/scalar-base-vector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck  %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck  %s
+
+declare void @do_safepoint()
+declare i8 addrspace(1)* @def_ptr()
+
+define i32 addrspace(1)* @test1(i8 addrspace(1)* %base1, <2 x i64> %offsets) gc "statepoint-example" {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[FIRST:%.*]], label [[SECOND:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, i8 addrspace(1)* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i8f(i64 2882400000, i32 0, i8 addrspace(1)* ()* @def_ptr, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[BASE21:%.*]] = call i8 addrspace(1)* @llvm.experimental.gc.result.p1i8(token [[STATEPOINT_TOKEN]])
+; CHECK-NEXT:    br label [[SECOND]]
+; CHECK:       second:
+; CHECK-NEXT:    [[PHI_BASE:%.*]] = phi i8 addrspace(1)* [ [[BASE1:%.*]], [[ENTRY:%.*]] ], [ [[BASE21]], [[FIRST]] ], !is_base_value !0
+; CHECK-NEXT:    [[PHI:%.*]] = phi i8 addrspace(1)* [ [[BASE1]], [[ENTRY]] ], [ [[BASE21]], [[FIRST]] ]
+; CHECK-NEXT:    [[BASE_I32:%.*]] = bitcast i8 addrspace(1)* [[PHI]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i8 addrspace(1)* [[PHI_BASE]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[DOTSPLATINSERT_BASE:%.*]] = insertelement <2 x i32 addrspace(1)*> zeroinitializer, i32 addrspace(1)* [[CAST]], i32 0, !is_base_value !0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32 addrspace(1)*> undef, i32 addrspace(1)* [[BASE_I32]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT_BASE:%.*]] = shufflevector <2 x i32 addrspace(1)*> [[DOTSPLATINSERT_BASE]], <2 x i32 addrspace(1)*> zeroinitializer, <2 x i32> zeroinitializer, !is_base_value !0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32 addrspace(1)*> [[DOTSPLATINSERT]], <2 x i32 addrspace(1)*> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC:%.*]] = getelementptr i32, <2 x i32 addrspace(1)*> [[DOTSPLAT]], <2 x i64> [[OFFSETS:%.*]]
+; CHECK-NEXT:    [[PTR_BASE:%.*]] = extractelement <2 x i32 addrspace(1)*> [[DOTSPLAT_BASE]], i32 1, !is_base_value !0
+; CHECK-NEXT:    [[PTR:%.*]] = extractelement <2 x i32 addrspace(1)*> [[VEC]], i32 1
+; CHECK-NEXT:    [[STATEPOINT_TOKEN2:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* [[PTR]], i32 addrspace(1)* [[PTR_BASE]])
+; CHECK-NEXT:    [[PTR_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 13, i32 12)
+; CHECK-NEXT:    [[PTR_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[PTR_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[PTR_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN2]], i32 13, i32 13)
+; CHECK-NEXT:    [[PTR_BASE_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[PTR_BASE_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    ret i32 addrspace(1)* [[PTR_RELOCATED_CASTED]]
+;
+entry:
+  br i1 undef, label %first, label %second
+
+first:
+  %base2 = call i8 addrspace(1)* @def_ptr() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  br label %second
+
+second:
+  %phi = phi i8 addrspace(1)* [ %base1, %entry ], [ %base2, %first ]
+  %base.i32 = bitcast i8 addrspace(1)* %phi to i32 addrspace(1)*
+  %vec = getelementptr i32, i32 addrspace(1)* %base.i32, <2 x i64> %offsets
+  %ptr = extractelement <2 x i32 addrspace(1)*> %vec, i32 1
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  ret i32 addrspace(1)* %ptr
+}
+
+define i32 addrspace(1)* @test2(i8 addrspace(1)* %base, <2 x i64> %offsets) gc "statepoint-example" {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE_I32:%.*]] = bitcast i8 addrspace(1)* [[BASE:%.*]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i8 addrspace(1)* [[BASE]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[DOTSPLATINSERT_BASE:%.*]] = insertelement <2 x i32 addrspace(1)*> zeroinitializer, i32 addrspace(1)* [[CAST]], i32 0, !is_base_value !0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32 addrspace(1)*> undef, i32 addrspace(1)* [[BASE_I32]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT_BASE:%.*]] = shufflevector <2 x i32 addrspace(1)*> [[DOTSPLATINSERT_BASE]], <2 x i32 addrspace(1)*> zeroinitializer, <2 x i32> zeroinitializer, !is_base_value !0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32 addrspace(1)*> [[DOTSPLATINSERT]], <2 x i32 addrspace(1)*> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC:%.*]] = getelementptr i32, <2 x i32 addrspace(1)*> [[DOTSPLAT]], <2 x i64> [[OFFSETS:%.*]]
+; CHECK-NEXT:    [[PTR_BASE:%.*]] = extractelement <2 x i32 addrspace(1)*> [[DOTSPLAT_BASE]], i32 1, !is_base_value !0
+; CHECK-NEXT:    [[PTR:%.*]] = extractelement <2 x i32 addrspace(1)*> [[VEC]], i32 1
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* [[PTR]], i32 addrspace(1)* [[PTR_BASE]])
+; CHECK-NEXT:    [[PTR_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 7)
+; CHECK-NEXT:    [[PTR_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[PTR_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[PTR_BASE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[PTR_BASE_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[PTR_BASE_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    ret i32 addrspace(1)* [[PTR_RELOCATED_CASTED]]
+;
+entry:
+  %base.i32 = bitcast i8 addrspace(1)* %base to i32 addrspace(1)*
+  %vec = getelementptr i32, i32 addrspace(1)* %base.i32, <2 x i64> %offsets
+  %ptr = extractelement <2 x i32 addrspace(1)*> %vec, i32 1
+  call void @do_safepoint()
+  ret i32 addrspace(1)* %ptr
+}
+
+define i32 addrspace(1)* @test3(<2 x i8 addrspace(1)*> %base, <2 x i64> %offsets) gc "statepoint-example" {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE_I32:%.*]] = bitcast <2 x i8 addrspace(1)*> [[BASE:%.*]] to <2 x i32 addrspace(1)*>
+; CHECK-NEXT:    [[VEC:%.*]] = getelementptr i32, <2 x i32 addrspace(1)*> [[BASE_I32]], <2 x i64> [[OFFSETS:%.*]]
+; CHECK-NEXT:    [[BASE_EE:%.*]] = extractelement <2 x i8 addrspace(1)*> [[BASE]], i32 1, !is_base_value !0
+; CHECK-NEXT:    [[PTR:%.*]] = extractelement <2 x i32 addrspace(1)*> [[VEC]], i32 1
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* [[PTR]], i8 addrspace(1)* [[BASE_EE]])
+; CHECK-NEXT:    [[PTR_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 7)
+; CHECK-NEXT:    [[PTR_RELOCATED_CASTED:%.*]] = bitcast i8 addrspace(1)* [[PTR_RELOCATED]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[BASE_EE_RELOCATED:%.*]] = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    ret i32 addrspace(1)* [[PTR_RELOCATED_CASTED]]
+;
+entry:
+  %base.i32 = bitcast <2 x i8 addrspace(1)*> %base to <2 x i32 addrspace(1)*>
+  %vec = getelementptr i32, <2 x i32 addrspace(1)*> %base.i32, <2 x i64> %offsets
+  %ptr = extractelement <2 x i32 addrspace(1)*> %vec, i32 1
+  call void @do_safepoint()
+  ret i32 addrspace(1)* %ptr
+}
+
+define i32 addrspace(1)* @test4(i8 addrspace(1)* %base, <2 x i64> %offsets) gc "statepoint-example" {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE_I32:%.*]] = bitcast i8 addrspace(1)* [[BASE:%.*]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i8 addrspace(1)* [[BASE]] to i32 addrspace(1)*
+; CHECK-NEXT:    [[DOTSPLATINSERT_BASE:%.*]] = insertelement <2 x i32 addrspace(1)*> zeroinitializer, i32 addrspace(1)* [[CAST]], i32 0, !is_base_value !0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32 addrspace(1)*> undef, i32 addrspace(1)* [[BASE_I32]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT_BASE:%.*]] = shufflevector <2 x i32 addrspace(1)*> [[DOTSPLATINSERT_BASE]], <2 x i32 addrspace(1)*> zeroinitializer, <2 x i32> zeroinitializer, !is_base_value !0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32 addrspace(1)*> [[DOTSPLATINSERT]], <2 x i32 addrspace(1)*> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC:%.*]] = getelementptr i32, <2 x i32 addrspace(1)*> [[DOTSPLAT]], <2 x i64> [[OFFSETS:%.*]]
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i32 addrspace(1)*> [[VEC]], <2 x i32 addrspace(1)*> [[DOTSPLAT_BASE]])
+; CHECK-NEXT:    [[VEC_RELOCATED:%.*]] = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 7)
+; CHECK-NEXT:    [[VEC_RELOCATED_CASTED:%.*]] = bitcast <2 x i8 addrspace(1)*> [[VEC_RELOCATED]] to <2 x i32 addrspace(1)*>
+; CHECK-NEXT:    [[DOTSPLAT_BASE_RELOCATED:%.*]] = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token [[STATEPOINT_TOKEN]], i32 8, i32 8)
+; CHECK-NEXT:    [[DOTSPLAT_BASE_RELOCATED_CASTED:%.*]] = bitcast <2 x i8 addrspace(1)*> [[DOTSPLAT_BASE_RELOCATED]] to <2 x i32 addrspace(1)*>
+; CHECK-NEXT:    [[PTR:%.*]] = extractelement <2 x i32 addrspace(1)*> [[VEC_RELOCATED_CASTED]], i32 1
+; CHECK-NEXT:    ret i32 addrspace(1)* [[PTR]]
+;
+entry:
+  %base.i32 = bitcast i8 addrspace(1)* %base to i32 addrspace(1)*
+  %vec = getelementptr i32, i32 addrspace(1)* %base.i32, <2 x i64> %offsets
+  call void @do_safepoint()
+  %ptr = extractelement <2 x i32 addrspace(1)*> %vec, i32 1
+  ret i32 addrspace(1)* %ptr
+}
+
+define i32 addrspace(1)* @test5(<2 x i8 addrspace(1)*> %base, <2 x i64> %offsets) gc "statepoint-example" {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE_I32:%.*]] = bitcast <2 x i8 addrspace(1)*> [[BASE:%.*]] to <2 x i32 addrspace(1)*>
+; CHECK-NEXT:    [[VEC:%.*]] = getelementptr i32, <2 x i32 addrspace(1)*> [[BASE_I32]], <2 x i64> [[OFFSETS:%.*]]
+; CHECK-NEXT:    [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i8 addrspace(1)*> [[BASE]])
+; CHECK-NEXT:    [[BASE_RELOCATED:%.*]] = call coldcc <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token [[STATEPOINT_TOKEN]], i32 7, i32 7)
+; CHECK-NEXT:    [[BASE_I32_REMAT:%.*]] = bitcast <2 x i8 addrspace(1)*> [[BASE_RELOCATED]] to <2 x i32 addrspace(1)*>
+; CHECK-NEXT:    [[VEC_REMAT:%.*]] = getelementptr i32, <2 x i32 addrspace(1)*> [[BASE_I32_REMAT]], <2 x i64> [[OFFSETS]]
+; CHECK-NEXT:    [[PTR:%.*]] = extractelement <2 x i32 addrspace(1)*> [[VEC_REMAT]], i32 0
+; CHECK-NEXT:    ret i32 addrspace(1)* [[PTR]]
+;
+entry:
+  %base.i32 = bitcast <2 x i8 addrspace(1)*> %base to <2 x i32 addrspace(1)*>
+  %vec = getelementptr i32, <2 x i32 addrspace(1)*> %base.i32, <2 x i64> %offsets
+  call void @do_safepoint()
+  %ptr = extractelement <2 x i32 addrspace(1)*> %vec, i32 0
+  ret i32 addrspace(1)* %ptr
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-attrs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-attrs.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-attrs.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-attrs.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+; Ensure statepoints copy (valid) attributes from callsites.
+
+declare void @f(i8 addrspace(1)* %obj)
+
+; copy over norecurse noimplicitfloat to statepoint call
+define void @test1(i8 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: test1(
+; CHECK: call token (i64, i32, void (i8 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidp1i8f(i64 2882400000, i32 0, void (i8 addrspace(1)*)* @f, i32 1, i32 0, i8 addrspace(1)* %arg, i32 0, i32 0, i8 addrspace(1)* %arg) #1
+
+ call void @f(i8 addrspace(1)* %arg) #1
+ ret void
+}
+
+
+attributes #1 = { norecurse noimplicitfloat }

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-calling-conventions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-calling-conventions.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-calling-conventions.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-calling-conventions.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -S < %s | FileCheck %s
+
+; Ensure that the gc.statepoint calls / invokes we generate carry over
+; the right calling conventions.
+
+define i64 addrspace(1)* @test_invoke_format(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) gc "statepoint-example" personality i32 ()* @personality {
+; CHECK-LABEL: @test_invoke_format(
+; CHECK-LABEL: entry:
+; CHECK: invoke coldcc token (i64, i32, i64 addrspace(1)* (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* (i64 addrspace(1)*)* @callee, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 0
+entry:
+  %ret_val = invoke coldcc i64 addrspace(1)* @callee(i64 addrspace(1)* %obj)
+               to label %normal_return unwind label %exceptional_return
+
+normal_return:
+  ret i64 addrspace(1)* %ret_val
+
+exceptional_return:
+  %landing_pad4 = landingpad token
+          cleanup
+  ret i64 addrspace(1)* %obj1
+}
+
+define i64 addrspace(1)* @test_call_format(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) gc "statepoint-example" {
+; CHECK-LABEL: @test_call_format(
+; CHECK-LABEL: entry:
+; CHECK: call coldcc token (i64, i32, i64 addrspace(1)* (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* (i64 addrspace(1)*)* @callee, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 0
+entry:
+  %ret_val = call coldcc i64 addrspace(1)* @callee(i64 addrspace(1)* %obj)
+  ret i64 addrspace(1)* %ret_val
+}
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}
+
+declare coldcc i64 addrspace(1)* @callee(i64 addrspace(1)*)
+declare i32 @personality()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-coreclr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-coreclr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-coreclr.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-coreclr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt < %s -S -rewrite-statepoints-for-gc | FileCheck %s
+; RUN: opt < %s -S -passes=rewrite-statepoints-for-gc | FileCheck %s
+
+; Basic test to make sure that safepoints are placed
+; for CoreCLR GC
+
+declare void @foo()
+
+define void @test_simple_call() gc "coreclr" {
+; CHECK-LABEL: test_simple_call
+entry:
+  br label %other
+other:
+; CHECK-LABEL: other
+; CHECK: statepoint
+; CHECK-NOT: gc.result
+  call void @foo()
+  ret void
+}
+
+; This function is inlined when inserting a poll.  To avoid recursive
+; issues, make sure we don't place safepoints in it.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+; CHECK-LABEL: entry
+; CHECK-NEXT: do_safepoint
+; CHECK-NEXT: ret void
+entry:
+  call void @do_safepoint()
+  ret void
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-format.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-format.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-format.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/statepoint-format.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; RUN: opt -rewrite-statepoints-for-gc -S < %s | FileCheck %s
+; RUN: opt -passes=rewrite-statepoints-for-gc -S < %s | FileCheck %s
+
+; Ensure that the gc.statepoint calls / invokes we generate have the
+; set of arguments we expect it to have.
+
+define i64 addrspace(1)* @test_invoke_format(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) gc "statepoint-example" personality i32 ()* @personality {
+; CHECK-LABEL: @test_invoke_format(
+; CHECK-LABEL: entry:
+; CHECK: invoke token (i64, i32, i64 addrspace(1)* (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* (i64 addrspace(1)*)* @callee, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 0, i64 addrspace(1)* %obj1, i64 addrspace(1)* %obj)
+entry:
+  %ret_val = invoke i64 addrspace(1)* @callee(i64 addrspace(1)* %obj)
+               to label %normal_return unwind label %exceptional_return
+
+normal_return:
+  ret i64 addrspace(1)* %ret_val
+
+exceptional_return:
+  %landing_pad4 = landingpad token
+          cleanup
+  ret i64 addrspace(1)* %obj1
+}
+
+define i64 addrspace(1)* @test_call_format(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) gc "statepoint-example" {
+; CHECK-LABEL: @test_call_format(
+; CHECK-LABEL: entry:
+; CHECK: call token (i64, i32, i64 addrspace(1)* (i64 addrspace(1)*)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p1i64p1i64f(i64 2882400000, i32 0, i64 addrspace(1)* (i64 addrspace(1)*)* @callee, i32 1, i32 0, i64 addrspace(1)* %obj, i32 0, i32 0, i64 addrspace(1)* %obj)
+entry:
+  %ret_val = call i64 addrspace(1)* @callee(i64 addrspace(1)* %obj)
+  ret i64 addrspace(1)* %ret_val
+}
+
+; This function is inlined when inserting a poll.
+declare void @do_safepoint()
+define void @gc.safepoint_poll() {
+; CHECK-LABEL: gc.safepoint_poll
+entry:
+  call void @do_safepoint()
+  ret void
+}
+
+declare i64 addrspace(1)* @callee(i64 addrspace(1)*)
+declare i32 @personality()

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/two-invokes-one-landingpad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/two-invokes-one-landingpad.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/two-invokes-one-landingpad.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/two-invokes-one-landingpad.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt < %s -rewrite-statepoints-for-gc -S | FileCheck %s
+; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S | FileCheck %s
+
+declare void @some_call(i64 addrspace(1)*)
+
+declare i32 @dummy_personality_function()
+
+define i64 addrspace(1)* @test(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1)
+  gc "statepoint-example"
+  personality i32 ()* @dummy_personality_function {
+entry:
+  invoke void @some_call(i64 addrspace(1)* %obj) [ "deopt"() ]
+          to label %second_invoke unwind label %exceptional_return
+
+second_invoke:                                    ; preds = %entry
+  invoke void @some_call(i64 addrspace(1)* %obj) [ "deopt"() ]
+          to label %normal_return unwind label %exceptional_return
+
+normal_return:                                    ; preds = %second_invoke
+  ret i64 addrspace(1)* %obj
+
+; CHECK: exceptional_return1:
+; CHECK-NEXT: %lpad2 = landingpad token
+
+; CHECK: exceptional_return.split-lp:
+; CHECK-NEXT: %lpad.split-lp = landingpad token
+
+; CHECK: exceptional_return:
+; CHECK-NOT: phi token
+
+exceptional_return:                               ; preds = %second_invoke, %entry
+  %lpad = landingpad token cleanup
+  ret i64 addrspace(1)* %obj1
+}

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/unreachable-regression.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/unreachable-regression.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/unreachable-regression.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/unreachable-regression.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+;
+; Regression test:
+;   After the rewritable callsite collection if any callsite was found
+;   in a block that was reported unreachable by DominanceTree then
+;   removeUnreachableBlocks() was called. But it is stronger than
+;   DominatorTree::isReachableFromEntry(), i.e. removeUnreachableBlocks
+;   can remove some blocks for which isReachableFromEntry() returns true.
+;   This resulted in stale pointers to the collected but removed
+;   callsites. Such stale pointers caused crash when accessed.
+declare void @f(i8 addrspace(1)* %obj)
+
+define void @test(i8 addrspace(1)* %arg) gc "statepoint-example" {
+; CHECK-LABEL: test(
+; CHECK-NEXT: @f
+ call void @f(i8 addrspace(1)* %arg) #1
+ br i1 true, label %not_zero, label %zero
+
+not_zero:
+ ret void
+
+; This block is reachable but removed by removeUnreachableBlocks()
+zero:
+; CHECK-NOT: @f
+ call void @f(i8 addrspace(1)* %arg) #1
+ ret void
+
+unreach:
+ call void @f(i8 addrspace(1)* %arg) #1
+ ret void
+}
+
+attributes #1 = { norecurse noimplicitfloat }

Added: llvm/trunk/test/Transforms/RewriteStatepointsForGC/vector-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/RewriteStatepointsForGC/vector-bitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/RewriteStatepointsForGC/vector-bitcast.ll (added)
+++ llvm/trunk/test/Transforms/RewriteStatepointsForGC/vector-bitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,29 @@
+; RUN: opt -S -rewrite-statepoints-for-gc < %s | FileCheck %s
+; RUN: opt -S -passes=rewrite-statepoints-for-gc < %s | FileCheck %s
+;
+; A test to make sure that we can look through bitcasts of
+; vector types when a base pointer is contained in a vector.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i8 addrspace(1)* @foo()
+
+; Function Attrs: uwtable
+define i32 @test() gc "statepoint-example" {
+; CHECK-LABEL: @test
+entry:
+; CHECK-LABEL: entry
+; CHECK: %bc = bitcast
+; CHECK: %[[p1:[A-Za-z0-9_]+]] = extractelement
+; CHECK: %[[p2:[A-Za-z0-9_]+]] = extractelement
+; CHECK: llvm.experimental.gc.statepoint
+; CHECK: %[[p2]].relocated = {{.+}} @llvm.experimental.gc.relocate
+; CHECK: %[[p1]].relocated = {{.+}} @llvm.experimental.gc.relocate
+; CHECK: load atomic
+  %bc = bitcast <8 x i8 addrspace(1)*> undef to <8 x i32 addrspace(1)*>
+  %ptr= extractelement <8 x i32 addrspace(1)*> %bc, i32 7
+  %0 = call i8 addrspace(1)* @foo() [ "deopt"() ]
+  %1 = load atomic i32, i32 addrspace(1)* %ptr unordered, align 4
+  ret i32 %1
+}

Added: llvm/trunk/test/Transforms/SCCP/2002-05-02-MissSecondInst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2002-05-02-MissSecondInst.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2002-05-02-MissSecondInst.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2002-05-02-MissSecondInst.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -sccp -S | not grep sub
+
+define void @test3(i32, i32) {
+	add i32 0, 0		; <i32>:3 [#uses=0]
+	sub i32 0, 4		; <i32>:4 [#uses=0]
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/SCCP/2002-05-20-MissedIncomingValue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2002-05-20-MissedIncomingValue.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2002-05-20-MissedIncomingValue.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2002-05-20-MissedIncomingValue.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; This test shows a case where SCCP is incorrectly eliminating the PHI node
+; because it thinks it has a constant 0 value, when it really doesn't.
+
+; RUN: opt < %s -sccp -S | grep phi
+
+define i32 @test(i32 %A, i1 %c) {
+bb1:
+	br label %BB2
+BB2:		; preds = %BB4, %bb1
+	%V = phi i32 [ 0, %bb1 ], [ %A, %BB4 ]		; <i32> [#uses=1]
+	br label %BB3
+BB3:		; preds = %BB2
+	br i1 %c, label %BB4, label %BB5
+BB4:		; preds = %BB3
+	br label %BB2
+BB5:		; preds = %BB3
+	ret i32 %V
+}
+

Added: llvm/trunk/test/Transforms/SCCP/2002-05-21-InvalidSimplify.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2002-05-21-InvalidSimplify.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2002-05-21-InvalidSimplify.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2002-05-21-InvalidSimplify.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,33 @@
+; This test shows SCCP "proving" that the loop (from bb6 to 14) loops infinitely
+; this is in fact NOT the case, so the return should still be alive in the code
+; after sccp and CFG simplification have been performed.
+;
+; RUN: opt < %s -sccp -simplifycfg -S | \
+; RUN:   grep ret
+
+define void @old_main() {
+bb3:
+	br label %bb6
+bb6:		; preds = %bb14, %bb3
+	%reg403 = phi i32 [ %reg155, %bb14 ], [ 0, %bb3 ]		; <i32> [#uses=1]
+	%reg155 = add i32 %reg403, 1		; <i32> [#uses=2]
+	br label %bb11
+bb11:		; preds = %bb11, %bb6
+	%reg407 = phi i32 [ %reg408, %bb11 ], [ 0, %bb6 ]		; <i32> [#uses=2]
+	%reg408 = add i32 %reg407, 1		; <i32> [#uses=1]
+	%cond550 = icmp sle i32 %reg407, 1		; <i1> [#uses=1]
+	br i1 %cond550, label %bb11, label %bb12
+bb12:		; preds = %bb11
+	br label %bb13
+bb13:		; preds = %bb13, %bb12
+	%reg409 = phi i32 [ %reg410, %bb13 ], [ 0, %bb12 ]		; <i32> [#uses=1]
+	%reg410 = add i32 %reg409, 1		; <i32> [#uses=2]
+	%cond552 = icmp sle i32 %reg410, 2		; <i1> [#uses=1]
+	br i1 %cond552, label %bb13, label %bb14
+bb14:		; preds = %bb13
+	%cond553 = icmp sle i32 %reg155, 31		; <i1> [#uses=1]
+	br i1 %cond553, label %bb6, label %bb15
+bb15:		; preds = %bb14
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/SCCP/2002-08-30-GetElementPtrTest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2002-08-30-GetElementPtrTest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2002-08-30-GetElementPtrTest.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2002-08-30-GetElementPtrTest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -sccp -S | not grep %X
+
+ at G = external global [40 x i32]		; <[40 x i32]*> [#uses=1]
+
+define i32* @test() {
+	%X = getelementptr [40 x i32], [40 x i32]* @G, i64 0, i64 0		; <i32*> [#uses=1]
+	ret i32* %X
+}
+

Added: llvm/trunk/test/Transforms/SCCP/2003-06-24-OverdefinedPHIValue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2003-06-24-OverdefinedPHIValue.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2003-06-24-OverdefinedPHIValue.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2003-06-24-OverdefinedPHIValue.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s -sccp -simplifycfg -S | \
+; RUN:   not grep then:
+
+define void @cprop_test11(i32* %data.1) {
+entry:
+	%tmp.1 = load i32, i32* %data.1		; <i32> [#uses=3]
+	%tmp.41 = icmp sgt i32 %tmp.1, 1		; <i1> [#uses=1]
+	br i1 %tmp.41, label %no_exit, label %loopexit
+no_exit:		; preds = %endif, %then, %entry
+	%j.0 = phi i32 [ %j.0, %endif ], [ %i.0, %then ], [ 1, %entry ]		; <i32> [#uses=3]
+	%i.0 = phi i32 [ %inc, %endif ], [ %inc1, %then ], [ 1, %entry ]		; <i32> [#uses=4]
+	%tmp.8.not = icmp ne i32 %j.0, 0		; <i1> [#uses=1]
+	br i1 %tmp.8.not, label %endif, label %then
+then:		; preds = %no_exit
+	%inc1 = add i32 %i.0, 1		; <i32> [#uses=3]
+	%tmp.42 = icmp slt i32 %inc1, %tmp.1		; <i1> [#uses=1]
+	br i1 %tmp.42, label %no_exit, label %loopexit
+endif:		; preds = %no_exit
+	%inc = add i32 %i.0, 1		; <i32> [#uses=3]
+	%tmp.4 = icmp slt i32 %inc, %tmp.1		; <i1> [#uses=1]
+	br i1 %tmp.4, label %no_exit, label %loopexit
+loopexit:		; preds = %endif, %then, %entry
+	%j.1 = phi i32 [ 1, %entry ], [ %j.0, %endif ], [ %i.0, %then ]		; <i32> [#uses=1]
+	%i.1 = phi i32 [ 1, %entry ], [ %inc, %endif ], [ %inc1, %then ]		; <i32> [#uses=1]
+	%tmp.17 = getelementptr i32, i32* %data.1, i64 1		; <i32*> [#uses=1]
+	store i32 %j.1, i32* %tmp.17
+	%tmp.23 = getelementptr i32, i32* %data.1, i64 2		; <i32*> [#uses=1]
+	store i32 %i.1, i32* %tmp.23
+	ret void
+}

Added: llvm/trunk/test/Transforms/SCCP/2003-08-26-InvokeHandling.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2003-08-26-InvokeHandling.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2003-08-26-InvokeHandling.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2003-08-26-InvokeHandling.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; The PHI cannot be eliminated from this testcase, SCCP is mishandling invoke's!
+; RUN: opt < %s -sccp -S | grep phi
+
+declare void @foo()
+
+define i32 @test(i1 %cond) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+Entry:
+	br i1 %cond, label %Inv, label %Cont
+Inv:		; preds = %Entry
+	invoke void @foo( )
+			to label %Ok unwind label %LPad
+Ok:		; preds = %Inv
+	br label %Cont
+LPad:
+        %val = landingpad { i8*, i32 }
+                 catch i8* null
+        br label %Cont
+Cont:		; preds = %Ok, %Inv, %Entry
+	%X = phi i32 [ 0, %Entry ], [ 1, %Ok ], [ 0, %LPad ]		; <i32> [#uses=1]
+	ret i32 %X
+}
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/SCCP/2004-11-16-DeadInvoke.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2004-11-16-DeadInvoke.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2004-11-16-DeadInvoke.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2004-11-16-DeadInvoke.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,18 @@
+; RUN: opt < %s -sccp -disable-output
+
+declare i32 @foo()
+
+define void @caller() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+	br i1 true, label %T, label %F
+F:		; preds = %0
+	%X = invoke i32 @foo( )
+			to label %T unwind label %LP		; <i32> [#uses=0]
+LP:
+        %val = landingpad { i8*, i32 }
+                 catch i8* null
+        br label %T
+T:
+	ret void
+}
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2004-12-10-UndefBranchBug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -sccp -S | grep "ret i32 1"
+
+; This function definitely returns 1, even if we don't know the direction
+; of the branch.
+
+define i32 @foo() {
+	br i1 undef, label %T, label %T
+T:		; preds = %0, %0
+	%X = add i32 0, 1		; <i32> [#uses=1]
+	ret i32 %X
+}
+

Added: llvm/trunk/test/Transforms/SCCP/2006-10-23-IPSCCP-Crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2006-10-23-IPSCCP-Crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2006-10-23-IPSCCP-Crash.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2006-10-23-IPSCCP-Crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,103 @@
+; RUN: opt < %s -sccp -disable-output
+; END.
+target datalayout = "E-p:32:32"
+target triple = "powerpc-unknown-linux-gnu"
+	%struct.pat_list = type { i32, %struct.pat_list* }
+ at JUMP = external global i32		; <i32*> [#uses=1]
+ at old_D_pat = external global [16 x i8]		; <[16 x i8]*> [#uses=0]
+
+define void @asearch1(i32 %D) {
+entry:
+	%tmp80 = icmp ult i32 0, %D		; <i1> [#uses=1]
+	br i1 %tmp80, label %bb647.preheader, label %cond_true81.preheader
+cond_true81.preheader:		; preds = %entry
+	ret void
+bb647.preheader:		; preds = %entry
+	%tmp3.i = call i32 @read( )		; <i32> [#uses=1]
+	%tmp6.i = add i32 %tmp3.i, 0		; <i32> [#uses=1]
+	%tmp653 = icmp sgt i32 %tmp6.i, 0		; <i1> [#uses=1]
+	br i1 %tmp653, label %cond_true654, label %UnifiedReturnBlock
+cond_true612:		; preds = %cond_true654
+	ret void
+cond_next624:		; preds = %cond_true654
+	ret void
+cond_true654:		; preds = %bb647.preheader
+	br i1 undef, label %cond_true612, label %cond_next624
+UnifiedReturnBlock:		; preds = %bb647.preheader
+	ret void
+}
+
+define void @bitap(i32 %D) {
+entry:
+	%tmp29 = icmp eq i32 0, 0		; <i1> [#uses=1]
+	br i1 %tmp29, label %cond_next50, label %cond_next37
+cond_next37:		; preds = %entry
+	ret void
+cond_next50:		; preds = %entry
+	%tmp52 = icmp sgt i32 %D, 0		; <i1> [#uses=1]
+	br i1 %tmp52, label %cond_true53, label %cond_next71
+cond_true53:		; preds = %cond_next50
+	%tmp54 = load i32, i32* @JUMP		; <i32> [#uses=1]
+	%tmp55 = icmp eq i32 %tmp54, 1		; <i1> [#uses=1]
+	br i1 %tmp55, label %cond_true56, label %cond_next63
+cond_true56:		; preds = %cond_true53
+	%tmp57 = bitcast i32 %D to i32		; <i32> [#uses=1]
+	call void @asearch1( i32 %tmp57 )
+	ret void
+cond_next63:		; preds = %cond_true53
+	ret void
+cond_next71:		; preds = %cond_next50
+	ret void
+}
+
+declare i32 @read()
+
+define void @initial_value() {
+entry:
+	ret void
+}
+
+define void @main() {
+entry:
+	br label %cond_next252
+cond_next208:		; preds = %cond_true260
+	%tmp229 = call i32 @atoi( )		; <i32> [#uses=1]
+	br label %cond_next252
+bb217:		; preds = %cond_true260
+	ret void
+cond_next252:		; preds = %cond_next208, %entry
+	%D.0.0 = phi i32 [ 0, %entry ], [ %tmp229, %cond_next208 ]		; <i32> [#uses=1]
+	%tmp254 = getelementptr i8*, i8** null, i32 1		; <i8**> [#uses=1]
+	%tmp256 = load i8*, i8** %tmp254		; <i8*> [#uses=1]
+	%tmp258 = load i8, i8* %tmp256		; <i8> [#uses=1]
+	%tmp259 = icmp eq i8 %tmp258, 45		; <i1> [#uses=1]
+	br i1 %tmp259, label %cond_true260, label %bb263
+cond_true260:		; preds = %cond_next252
+	%tmp205818 = icmp sgt i8 0, -1		; <i1> [#uses=1]
+	br i1 %tmp205818, label %cond_next208, label %bb217
+bb263:		; preds = %cond_next252
+	%tmp265 = icmp eq i32 0, 0		; <i1> [#uses=1]
+	br i1 %tmp265, label %cond_next276, label %cond_true266
+cond_true266:		; preds = %bb263
+	ret void
+cond_next276:		; preds = %bb263
+	%tmp278 = icmp eq i32 0, 0		; <i1> [#uses=1]
+	br i1 %tmp278, label %cond_next298, label %cond_true279
+cond_true279:		; preds = %cond_next276
+	ret void
+cond_next298:		; preds = %cond_next276
+	call void @bitap( i32 %D.0.0 )
+	ret void
+}
+
+declare i32 @atoi()
+
+define void @subset_pset() {
+entry:
+	ret void
+}
+
+define void @strcmp() {
+entry:
+	ret void
+}

Added: llvm/trunk/test/Transforms/SCCP/2006-12-04-PackedType.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2006-12-04-PackedType.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2006-12-04-PackedType.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2006-12-04-PackedType.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,140 @@
+; Test VectorType handling by SCCP.
+; SCCP ignores VectorTypes until PR 1034 is fixed
+;
+; RUN: opt < %s -sccp
+; END.
+
+target datalayout = "E-p:32:32"
+target triple = "powerpc-unknown-linux-gnu"
+	%struct.GLDAlphaTest = type { float, i16, i8, i8 }
+	%struct.GLDArrayRange = type { i8, i8, i8, i8 }
+	%struct.GLDBlendMode = type { i16, i16, i16, i16, %struct.GLTColor4, i16, i16, i8, i8, i8, i8 }
+	%struct.GLDBufferRec = type opaque
+	%struct.GLDBufferstate = type { %struct.GLTDimensions, %struct.GLTDimensions, %struct.GLTFixedColor4, %struct.GLTFixedColor4, i8, i8, i8, i8, [2 x %struct.GLSBuffer], [4 x %struct.GLSBuffer], %struct.GLSBuffer, %struct.GLSBuffer, %struct.GLSBuffer, [4 x %struct.GLSBuffer*], %struct.GLSBuffer*, %struct.GLSBuffer*, %struct.GLSBuffer*, i8, i8 }
+	%struct.GLDClearColor = type { double, %struct.GLTColor4, %struct.GLTColor4, float, i32 }
+	%struct.GLDClipPlane = type { i32, [6 x %struct.GLTColor4] }
+	%struct.GLDColorBuffer = type { i16, i16, [4 x i16] }
+	%struct.GLDColorMatrix = type { [16 x float]*, %struct.GLDImagingColorScale }
+	%struct.GLDContextRec = type { float, float, float, float, float, float, float, float, %struct.GLTColor4, %struct.GLTColor4, %struct.GLVMFPContext, %struct.GLDTextureMachine, %struct.GLGProcessor, %struct._GLVMConstants*, void (%struct.GLDContextRec*, i32, i32, %struct.GLVMFragmentAttribRec*, %struct.GLVMFragmentAttribRec*, i32)*, %struct._GLVMFunction*, void (%struct.GLDContextRec*, %struct.GLDVertex*)*, void (%struct.GLDContextRec*, %struct.GLDVertex*, %struct.GLDVertex*)*, void (%struct.GLDContextRec*, %struct.GLDVertex*, %struct.GLDVertex*, %struct.GLDVertex*)*, %struct._GLVMFunction*, %struct._GLVMFunction*, %struct._GLVMFunction*, i32, i32, i32, float, float, float, i32, %struct.GLSDrawable, %struct.GLDFramebufferAttachment, %struct.GLDFormat, %struct.GLDBufferstate, %struct.GLDSharedRec*, %struct.GLDState*, %struct.GLDPluginState*, %struct.GLTDimensions, %struct.GLTColor4*, %struct.GLTColor4*, %struct.GLVMFragmentAttribRec*, %struct.GLVMFragmentAttribRec*, %struct.GLVMFragmentAttribRec*, %struct.GLDPipelineProgramRec*, %struct.GLDStateProgramRec, %struct.GLVMTextures, { [4 x i8*], i8*, i8* }, [64 x float], %struct.GLDStippleData, i16, i8, i8, i32, %struct.GLDFramebufferRec*, i8, %struct.GLDQueryRec*, %struct.GLDQueryRec* }
+	%struct.GLDConvolution = type { %struct.GLTColor4, %struct.GLDImagingColorScale, i16, i16, float*, i32, i32 }
+	%struct.GLDDepthTest = type { i16, i16, i8, i8, i8, i8, double, double }
+	%struct.GLDFogMode = type { %struct.GLTColor4, float, float, float, float, float, i16, i16, i16, i8, i8 }
+	%struct.GLDFormat = type { i32, i32, i32, i32, i32, i32, i32, i32, i8, i8, i8, i8, i32, i32, i32 }
+	%struct.GLDFramebufferAttachment = type { i32, i32, i32, i32, i32, i32 }
+	%struct.GLDFramebufferData = type { [6 x %struct.GLDFramebufferAttachment], [4 x i16], i16, i16, i16, i16, i32 }
+	%struct.GLDFramebufferRec = type { %struct.GLDFramebufferData*, %struct.GLDPluginFramebufferData*, %struct.GLDPixelFormat }
+	%struct.GLDHintMode = type { i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 }
+	%struct.GLDHistogram = type { %struct.GLTFixedColor4*, i32, i16, i8, i8 }
+	%struct.GLDImagingColorScale = type { { float, float }, { float, float }, { float, float }, { float, float } }
+	%struct.GLDImagingSubset = type { %struct.GLDConvolution, %struct.GLDConvolution, %struct.GLDConvolution, %struct.GLDColorMatrix, %struct.GLDMinmax, %struct.GLDHistogram, %struct.GLDImagingColorScale, %struct.GLDImagingColorScale, %struct.GLDImagingColorScale, %struct.GLDImagingColorScale, i32 }
+	%struct.GLDLight = type { %struct.GLTColor4, %struct.GLTColor4, %struct.GLTColor4, %struct.GLTColor4, %struct.GLTCoord3, float, float, float, float, float, %struct.GLTCoord3, float, float, float, float, float }
+	%struct.GLDLightModel = type { %struct.GLTColor4, [8 x %struct.GLDLight], [2 x %struct.GLDMaterial], i32, i16, i16, i16, i8, i8, i8, i8, i8, i8 }
+	%struct.GLDLightProduct = type { %struct.GLTColor4, %struct.GLTColor4, %struct.GLTColor4 }
+	%struct.GLDLineMode = type { float, i32, i16, i16, i8, i8, i8, i8 }
+	%struct.GLDLogicOp = type { i16, i8, i8 }
+	%struct.GLDMaskMode = type { i32, [3 x i32], i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.GLDMaterial = type { %struct.GLTColor4, %struct.GLTColor4, %struct.GLTColor4, %struct.GLTColor4, float, float, float, float, [8 x %struct.GLDLightProduct], %struct.GLTColor4, [6 x i32], [2 x i32] }
+	%struct.GLDMinmax = type { %struct.GLDMinmaxTable*, i16, i8, i8 }
+	%struct.GLDMinmaxTable = type { %struct.GLTColor4, %struct.GLTColor4 }
+	%struct.GLDMipmaplevel = type { [4 x i32], [4 x float], [4 x i32], [4 x i32], [4 x float], [4 x i32], [3 x i32], i32, float*, float*, float*, i32, i32, i8*, i16, i16, i16, i16 }
+	%struct.GLDMultisample = type { float, i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.GLDPipelineProgramData = type { i16, i16, i32, %struct._PPStreamToken*, i64, %struct.GLDShaderSourceData*, %struct.GLTColor4*, i32 }
+	%struct.GLDPipelineProgramRec = type { %struct.GLDPipelineProgramData*, %struct._PPStreamToken*, %struct._PPStreamToken*, %struct._GLVMFunction*, i32, i32, i32 }
+	%struct.GLDPipelineProgramState = type { i8, i8, i8, i8, %struct.GLTColor4* }
+	%struct.GLDPixelFormat = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.GLDPixelMap = type { i32*, float*, float*, float*, float*, float*, float*, float*, float*, i32*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+	%struct.GLDPixelMode = type { float, float, %struct.GLDPixelStore, %struct.GLDPixelTransfer, %struct.GLDPixelMap, %struct.GLDImagingSubset, i32, i32 }
+	%struct.GLDPixelPack = type { i32, i32, i32, i32, i32, i32, i32, i32, i8, i8, i8, i8 }
+	%struct.GLDPixelStore = type { %struct.GLDPixelPack, %struct.GLDPixelPack }
+	%struct.GLDPixelTransfer = type { float, float, float, float, float, float, float, float, float, float, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float }
+	%struct.GLDPluginFramebufferData = type { [6 x %struct.GLDTextureRec*], i32, i32 }
+	%struct.GLDPluginProgramData = type { [3 x %struct.GLDPipelineProgramRec*], %struct.GLDBufferRec**, i32 }
+	%struct.GLDPluginState = type { [16 x [5 x %struct.GLDTextureRec*]], [3 x %struct.GLDTextureRec*], [16 x %struct.GLDTextureRec*], [3 x %struct.GLDPipelineProgramRec*], %struct.GLDProgramRec*, %struct.GLDVertexArrayRec*, [16 x %struct.GLDBufferRec*], %struct.GLDFramebufferRec*, %struct.GLDFramebufferRec* }
+	%struct.GLDPointMode = type { float, float, float, float, %struct.GLTCoord3, float, i8, i8, i8, i8, i16, i16, i32, i16, i16 }
+	%struct.GLDPolygonMode = type { [128 x i8], float, float, i16, i16, i16, i16, i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.GLDProgramData = type { i32, [16 x i32], i32, i32, i32, i32 }
+	%struct.GLDProgramRec = type { %struct.GLDProgramData*, %struct.GLDPluginProgramData*, i32 }
+	%struct.GLDQueryRec = type { i32, i32, %struct.GLDQueryRec* }
+	%struct.GLDRect = type { i32, i32, i32, i32, i32, i32 }
+	%struct.GLDRegisterCombiners = type { i8, i8, i8, i8, i32, [2 x %struct.GLTColor4], [8 x %struct.GLDRegisterCombinersPerStageState], %struct.GLDRegisterCombinersFinalStageState }
+	%struct.GLDRegisterCombinersFinalStageState = type { i8, i8, i8, i8, [7 x %struct.GLDRegisterCombinersPerVariableState] }
+	%struct.GLDRegisterCombinersPerPortionState = type { [4 x %struct.GLDRegisterCombinersPerVariableState], i8, i8, i8, i8, i16, i16, i16, i16, i16, i16 }
+	%struct.GLDRegisterCombinersPerStageState = type { [2 x %struct.GLDRegisterCombinersPerPortionState], [2 x %struct.GLTColor4] }
+	%struct.GLDRegisterCombinersPerVariableState = type { i16, i16, i16, i16 }
+	%struct.GLDScissorTest = type { %struct.GLTFixedColor4, i8, i8, i8, i8 }
+	%struct.GLDShaderSourceData = type { i32, i32, i8*, i32*, i32, i32, i8*, i32*, i8* }
+	%struct.GLDSharedRec = type opaque
+	%struct.GLDState = type { i16, i16, i32, i32, i32, [256 x %struct.GLTColor4], [128 x %struct.GLTColor4], %struct.GLDViewport, %struct.GLDTransform, %struct.GLDLightModel, i32*, i32, i32, i32, %struct.GLDAlphaTest, %struct.GLDBlendMode, %struct.GLDClearColor, %struct.GLDColorBuffer, %struct.GLDDepthTest, %struct.GLDArrayRange, %struct.GLDFogMode, %struct.GLDHintMode, %struct.GLDLineMode, %struct.GLDLogicOp, %struct.GLDMaskMode, %struct.GLDPixelMode, %struct.GLDPointMode, %struct.GLDPolygonMode, %struct.GLDScissorTest, i32, %struct.GLDStencilTest, [16 x %struct.GLDTextureMode], %struct.GLDArrayRange, [8 x %struct.GLDTextureCoordGen], %struct.GLDClipPlane, %struct.GLDMultisample, %struct.GLDRegisterCombiners, %struct.GLDArrayRange, %struct.GLDArrayRange, [3 x %struct.GLDPipelineProgramState], %struct.GLDTransformFeedback }
+	%struct.GLDStateProgramRec = type { %struct.GLDPipelineProgramData*, %struct.GLDPipelineProgramRec* }
+	%struct.GLDStencilTest = type { [3 x { i32, i32, i16, i16, i16, i16 }], i32, [4 x i8] }
+	%struct.GLDStippleData = type { i32, i16, i16, [32 x [32 x i8]] }
+	%struct.GLDTextureCoordGen = type { { i16, i16, %struct.GLTColor4, %struct.GLTColor4 }, { i16, i16, %struct.GLTColor4, %struct.GLTColor4 }, { i16, i16, %struct.GLTColor4, %struct.GLTColor4 }, { i16, i16, %struct.GLTColor4, %struct.GLTColor4 }, i8, i8, i8, i8 }
+	%struct.GLDTextureGeomState = type { i16, i16, i16, i16, i16, i8, i8, i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, [6 x i16], [6 x i16] }
+	%struct.GLDTextureLevel = type { i32, i32, i16, i16, i16, i8, i8, i16, i16, i16, i16, i8* }
+	%struct.GLDTextureMachine = type { [8 x %struct.GLDTextureRec*], %struct.GLDTextureRec*, i8, i8, i8, i8 }
+	%struct.GLDTextureMode = type { %struct.GLTColor4, i32, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, float, float, float, i16, i16, i16, i16, i16, i16, [4 x i16], i8, i8, i8, i8, [3 x float], [4 x float], float, float }
+	%struct.GLDTextureParamState = type { i16, i16, i16, i16, i16, i16, %struct.GLTColor4, float, float, float, float, i16, i16, i16, i16, float, i16, i8, i8, i32, i8* }
+	%struct.GLDTextureRec = type { %struct.GLDTextureState*, i32, [2 x float], float, i32, float, float, float, float, float, float, %struct.GLDMipmaplevel*, %struct.GLDMipmaplevel*, i32, i32, i32, i32, i32, i32, %struct.GLDTextureParamState, i32, [2 x %struct._PPStreamToken] }
+	%struct.GLDTextureState = type { i16, i16, i16, float, i32, i16, %struct.GLISWRSurface*, i8, i8, i8, i8, %struct.GLDTextureParamState, %struct.GLDTextureGeomState, %struct.GLDTextureLevel, [6 x [15 x %struct.GLDTextureLevel]] }
+	%struct.GLDTransform = type { [24 x [16 x float]], [24 x [16 x float]], [16 x float], float, float, float, float, i32, float, i16, i16, i8, i8, i8, i8 }
+	%struct.GLDTransformFeedback = type { i8, i8, i8, [16 x i32], [16 x i32] }
+	%struct.GLDVertex = type { %struct.GLTColor4, %struct.GLTColor4, %struct.GLTColor4, %struct.GLTColor4, %struct.GLTColor4, %struct.GLTCoord3, float, %struct.GLTColor4, float, float, float, i8, i8, i8, i8, [4 x float], [2 x %struct.GLDMaterial*], i32, i32, [8 x %struct.GLTColor4] }
+	%struct.GLDVertexArrayRec = type opaque
+	%struct.GLDViewport = type { float, float, float, float, float, float, float, float, double, double, i32, i32, i32, i32, float, float, float, float }
+	%struct.GLGColorTable = type { i32, i32, i32, i8* }
+	%struct.GLGOperation = type { i8*, i8*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, %struct.GLGColorTable, %struct.GLGColorTable, %struct.GLGColorTable }
+	%struct.GLGProcessor = type { void (%struct.GLDPixelMode*, %struct.GLGOperation*, %struct._GLGFunctionKey*)*, %struct._GLVMFunction*, %struct._GLGFunctionKey* }
+	%struct.GLISWRSurface = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8*, i8*, i8*, [4 x i8*], i32 }
+	%struct.GLIWindow = type { i32, i32, i32 }
+	%struct.GLSBuffer = type { i8* }
+	%struct.GLSDrawable = type { %struct.GLSWindowRec* }
+	%struct.GLSWindowRec = type { %struct.GLTDimensions, %struct.GLTDimensions, i32, i32, %struct.GLSDrawable, [2 x i8*], i8*, i8*, i8*, [4 x i8*], i32, i32, i32, i32, [4 x i32], i16, i16, i16, %struct.GLIWindow, i32, i32, i8*, i8* }
+	%struct.GLTColor4 = type { float, float, float, float }
+	%struct.GLTCoord3 = type { float, float, float }
+	%struct.GLTDimensions = type { i32, i32 }
+	%struct.GLTFixedColor4 = type { i32, i32, i32, i32 }
+	%struct.GLVMFPContext = type { float, i32, i32, i32 }
+	%struct.GLVMFragmentAttribRec = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, [8 x <4 x float>] }
+	%struct.GLVMTextures = type { [8 x %struct.GLDTextureRec*] }
+	%struct._GLGFunctionKey = type opaque
+	%struct._GLVMConstants = type opaque
+	%struct._GLVMFunction = type opaque
+	%struct._PPStreamToken = type { { i16, i8, i8, i32 } }
+
+define void @gldLLVMVecPointRender(%struct.GLDContextRec* %ctx) {
+entry:
+	%tmp.uip = getelementptr %struct.GLDContextRec, %struct.GLDContextRec* %ctx, i32 0, i32 22		; <i32*> [#uses=1]
+	%tmp = load i32, i32* %tmp.uip		; <i32> [#uses=3]	; flags word; individual bits are tested below
+	%tmp91 = lshr i32 %tmp, 5		; <i32> [#uses=1]
+	%tmp92 = trunc i32 %tmp91 to i1		; <i1> [#uses=1]	; bit 5 of %tmp
+	br i1 %tmp92, label %cond_true93, label %cond_next116
+cond_true93:		; preds = %entry
+	%tmp.upgrd.1 = getelementptr %struct.GLDContextRec, %struct.GLDContextRec* %ctx, i32 0, i32 31, i32 14		; <i32*> [#uses=1]
+	%tmp95 = load i32, i32* %tmp.upgrd.1		; <i32> [#uses=1]
+	%tmp95.upgrd.2 = sitofp i32 %tmp95 to float		; <float> [#uses=1]
+	%tmp108 = fmul float undef, %tmp95.upgrd.2		; <float> [#uses=1]	; undef operand deliberate (reduced test)
+	br label %cond_next116
+cond_next116:		; preds = %cond_true93, %entry
+	%point_size.2 = phi float [ %tmp108, %cond_true93 ], [ undef, %entry ]		; <float> [#uses=2]
+	%tmp457 = fcmp olt float %point_size.2, 1.000000e+00		; <i1> [#uses=1]
+	%tmp460 = lshr i32 %tmp, 6		; <i32> [#uses=1]
+	%tmp461 = trunc i32 %tmp460 to i1		; <i1> [#uses=1]	; bit 6 of %tmp
+	br i1 %tmp457, label %cond_true458, label %cond_next484
+cond_true458:		; preds = %cond_next116
+	br i1 %tmp461, label %cond_true462, label %cond_next487
+cond_true462:		; preds = %cond_true458
+	%tmp26 = bitcast i32 %tmp to i32		; <i32> [#uses=1]	; no-op bitcast left by test reduction
+	%tmp465 = and i32 %tmp26, 128		; <i32> [#uses=1]	; bit 7 of %tmp
+	%tmp466 = icmp eq i32 %tmp465, 0		; <i1> [#uses=1]
+	br i1 %tmp466, label %cond_true467, label %cond_next487
+cond_true467:		; preds = %cond_true462
+	ret void
+cond_next484:		; preds = %cond_next116
+	%tmp486 = fmul float %point_size.2, 5.000000e-01		; <float> [#uses=1]	; radius = point_size / 2
+	br label %cond_next487
+cond_next487:		; preds = %cond_next484, %cond_true462, %cond_true458
+	%radius.0 = phi float [ %tmp486, %cond_next484 ], [ 5.000000e-01, %cond_true458 ], [ 5.000000e-01, %cond_true462 ]		; <float> [#uses=2]
+	%tmp494 = insertelement <4 x float> zeroinitializer, float %radius.0, i32 2		; <<4 x float>> [#uses=1]
+	%tmp495 = insertelement <4 x float> %tmp494, float %radius.0, i32 3		; <<4 x float>> [#uses=0]	; vector build exercises SCCP's VectorType handling
+	ret void
+}

Added: llvm/trunk/test/Transforms/SCCP/2006-12-19-UndefBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2006-12-19-UndefBug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2006-12-19-UndefBug.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2006-12-19-UndefBug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -sccp -S | \
+; RUN:   grep "ret i1 false"
+
+define i1 @foo() {
+	%X = and i1 false, undef		; <i1> [#uses=1]	; false & undef must fold to false, never to undef
+	ret i1 %X		; RUN line above checks SCCP produces "ret i1 false"
+}
+

Added: llvm/trunk/test/Transforms/SCCP/2007-05-16-InvokeCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2007-05-16-InvokeCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2007-05-16-InvokeCrash.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2007-05-16-InvokeCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; RUN: opt < %s -sccp -disable-output
+; PR1431
+
+define void @_ada_bench() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+	br label %cond_next
+cond_next:		; preds = %cond_next, %entry
+	%indvar46 = phi i32 [ 0, %entry ], [ %indvar.next47, %cond_next ]		; <i32> [#uses=1]
+	%indvar.next47 = add i32 %indvar46, 1		; <i32> [#uses=2]
+	%exitcond48 = icmp eq i32 %indvar.next47, 10000		; <i1> [#uses=1]	; first empty 10000-iteration loop
+	br i1 %exitcond48, label %cond_next40, label %cond_next
+cond_next40:		; preds = %cond_next40, %cond_next
+	%indvar43 = phi i32 [ %indvar.next44, %cond_next40 ], [ 0, %cond_next ]		; <i32> [#uses=1]
+	%indvar.next44 = add i32 %indvar43, 1		; <i32> [#uses=2]
+	%exitcond45 = icmp eq i32 %indvar.next44, 10000		; <i1> [#uses=1]	; second empty 10000-iteration loop
+	br i1 %exitcond45, label %cond_next53, label %cond_next40
+cond_next53:		; preds = %cond_next53, %cond_next40
+	%indvar41 = phi i32 [ %indvar.next42, %cond_next53 ], [ 0, %cond_next40 ]		; <i32> [#uses=1]
+	%indvar.next42 = add i32 %indvar41, 1		; <i32> [#uses=2]
+	%exitcond = icmp eq i32 %indvar.next42, 10000		; <i1> [#uses=1]	; third empty 10000-iteration loop
+	br i1 %exitcond, label %bb67, label %cond_next53
+bb67:		; preds = %cond_next53
+	%tmp112 = invoke double @sin( double 5.000000e-01 )
+			to label %bb114 unwind label %cleanup		; <double> [#uses=0]
+bb114:		; preds = %bb67
+	%tmp147 = invoke double @log( double 5.000000e-01 )
+			to label %bb149 unwind label %cleanup		; <double> [#uses=0]
+bb149:		; preds = %bb114
+	%tmp175 = invoke double @sqrt( double 5.000000e-01 )
+			to label %bb177 unwind label %cleanup		; <double> [#uses=0]	; three invokes sharing one landing pad (the PR1431 crash shape)
+bb177:		; preds = %bb149
+	unreachable
+cleanup:		; preds = %bb149, %bb114, %bb67
+        %val = landingpad { i8*, i32 }
+                 cleanup
+	resume { i8*, i32 } %val
+}
+
+declare double @sin(double)
+
+declare double @log(double)
+
+declare double @sqrt(double)
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/SCCP/2008-01-27-UndefCorrelate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2008-01-27-UndefCorrelate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2008-01-27-UndefCorrelate.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2008-01-27-UndefCorrelate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; RUN: opt < %s -sccp -S | grep undef | count 1
+; PR1938
+
+define i32 @main() {
+entry:
+	br label %bb
+
+bb:
+	%indvar = phi i32 [ 0, %entry ], [ %k, %bb.backedge ]
+	%k = add i32 %indvar, 1
+	br i1 undef, label %cond_true, label %cond_false		; the one branch on undef the RUN line counts
+
+cond_true:
+	%tmp97 = icmp slt i32 %k, 10
+	br i1 %tmp97, label %bb.backedge, label %bb12		; both arms exit only when k >= 10
+
+bb.backedge:
+	br label %bb
+
+cond_false:
+	%tmp9 = icmp slt i32 %k, 10
+	br i1 %tmp9, label %bb.backedge, label %bb12
+
+bb12:
+	%tmp14 = icmp eq i32 %k, 10
+	br i1 %tmp14, label %cond_next18, label %cond_true17		; k == 10 on every path into bb12, so abort is dead
+
+cond_true17:
+	tail call void @abort( )
+	unreachable
+
+cond_next18:
+	ret i32 0
+}
+
+declare void @abort()

Added: llvm/trunk/test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2008-04-22-multiple-ret-sccp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -sccp -S | grep "ret i32 %Z"
+; rdar://5778210
+
+declare {i32, i32} @bar(i32 %A) 
+
+define i32 @foo() {
+	%X = call {i32, i32} @bar(i32 17)		; multi-return via aggregate
+        %Y = extractvalue {i32, i32} %X, 0
+	%Z = add i32 %Y, %Y
+	ret i32 %Z		; RUN line checks SCCP leaves "ret i32 %Z" intact (rdar://5778210)
+}

Added: llvm/trunk/test/Transforms/SCCP/2008-05-23-UndefCallFold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2008-05-23-UndefCallFold.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2008-05-23-UndefCallFold.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2008-05-23-UndefCallFold.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt < %s -sccp -S | not grep "ret i32 undef"
+; PR2358
+target datalayout =
+"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-pc-linux-gnu"
+
+define i32 @x(i32 %b) {
+entry:
+ %val = call i32 @llvm.cttz.i32(i32 undef, i1 true)		; cttz on undef — must not be folded to "ret i32 undef" (PR2358)
+ ret i32 %val
+}
+
+declare i32 @llvm.cttz.i32(i32, i1)
+

Added: llvm/trunk/test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2009-01-14-IPSCCP-Invoke.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+; RUN: opt < %s -ipsccp -S | grep "ret i32 42"
+; RUN: opt < %s -ipsccp -S | grep "ret i32 undef"
+; PR3325
+
+define i32 @main() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+	%tmp1 = invoke i32 @f()
+			to label %UnifiedReturnBlock unwind label %lpad		; @f is internal, so IPSCCP can resolve %tmp1 across the invoke
+
+lpad:
+        %val = landingpad { i8*, i32 }
+                 cleanup
+	unreachable
+
+UnifiedReturnBlock:
+	ret i32 %tmp1		; expected to become "ret i32 42" per the RUN lines (PR3325)
+}
+
+define internal i32 @f() {
+       ret i32 42		; constant return that IPSCCP propagates into the caller
+}
+
+declare i8* @__cxa_begin_catch(i8*) nounwind
+
+declare void @__cxa_end_catch()
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/SCCP/2009-05-27-VectorOperandZero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/2009-05-27-VectorOperandZero.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/2009-05-27-VectorOperandZero.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/2009-05-27-VectorOperandZero.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -sccp -disable-output
+; PR4277
+
+define i32 @main() nounwind {
+entry:
+	%0 = tail call signext i8 (...) @sin() nounwind		; varargs call with zero fixed operands — PR4277 crash shape
+	ret i32 0
+}
+
+declare signext i8 @sin(...)

Added: llvm/trunk/test/Transforms/SCCP/apint-array.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-array.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-array.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-array.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -sccp -S | grep "ret i101 12"
+
+ at Y = constant [6 x i101] [ i101 12, i101 123456789000000, i101 -12,i101 
+-123456789000000, i101 0,i101 9123456789000000]
+
+define i101 @array()
+{
+Head:
+   %A = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 1		; &@Y[1], element value 123456789000000
+
+   %B = load i101, i101* %A
+   %C = icmp sge i101 %B, 1		; true: 123456789000000 >= 1
+   br i1 %C, label %True, label %False
+True:
+   %D = and i101 %B, 1		; low bit of an even constant == 0
+   %E = trunc i101 %D to i32
+   %F = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 %E		; &@Y[0], element value 12
+   %G = load i101, i101* %F
+   br label %False
+False:
+   %H = phi i101 [%G, %True], [-1, %Head]		; only the %True edge is reachable
+   ret i101 %H		; RUN line expects SCCP to fold this to "ret i101 12"
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-basictest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-basictest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-basictest.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-basictest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; This is a basic sanity check for constant propagation.  The add instruction
+; should be eliminated.
+
+; RUN: opt < %s -sccp -S | not grep add
+
+define i128 @test(i1 %B) {
+	br i1 %B, label %BB1, label %BB2
+BB1:
+	%Val = add i128 0, 1		; constant-foldable to 1; RUN line checks it is eliminated
+	br label %BB3
+BB2:
+	br label %BB3
+BB3:
+	%Ret = phi i128 [%Val, %BB1], [2, %BB2]		; phi of two distinct constants, so the phi itself must stay
+	ret i128 %Ret
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-basictest2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-basictest2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-basictest2.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-basictest2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; This is a basic sanity check for constant propagation.  The add instruction
+; and phi instruction should be eliminated.
+
+; RUN: opt < %s -sccp -S | not grep phi
+; RUN: opt < %s -sccp -S | not grep add
+
+define i128 @test(i1 %B) {
+	br i1 %B, label %BB1, label %BB2
+BB1:
+	%Val = add i128 0, 1		; folds to 1
+	br label %BB3
+BB2:
+	br label %BB3
+BB3:
+	%Ret = phi i128 [%Val, %BB1], [1, %BB2]		; both incoming values are 1, so the phi should fold away too
+	ret i128 %Ret
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-basictest3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-basictest3.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-basictest3.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-basictest3.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; This is a basic sanity check for constant propagation.  It tests the basic
+; arithmetic operations.
+
+
+; RUN: opt < %s -sccp -S | not grep mul
+; RUN: opt < %s -sccp -S | not grep umod
+
+define i128 @test(i1 %B) {
+	br i1 %B, label %BB1, label %BB2
+BB1:
+	%t1 = add i128 0, 1		; 1
+        %t2 = sub i128 0, %t1		; -1
+        %t3 = mul i128 %t2, -1		; 1 — RUN line checks the mul is folded away
+	br label %BB3
+BB2:
+        %f1 = udiv i128 -1, 1		; all-ones i128
+        %f2 = add i128 %f1, 1		; wraps to 0
+        %f3 = urem i128 %f2, 2121		; 0
+	br label %BB3
+BB3:
+	%Ret = phi i128 [%t3, %BB1], [%f3, %BB2]
+	ret i128 %Ret
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-basictest4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-basictest4.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-basictest4.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-basictest4.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; This is a basic sanity check for constant propagation.  It tests the basic
+; logic operations.
+
+
+; RUN: opt < %s -sccp -S | not grep and
+; RUN: opt < %s -sccp -S | not grep trunc
+; RUN: opt < %s -sccp -S | grep "ret i100 -1"
+
+define i100 @test(i133 %A) {
+        %B = and i133 0, %A		; 0 regardless of %A
+        %C = icmp sgt i133 %B, 0		; false
+	br i1 %C, label %BB1, label %BB2		; only %BB2 is reachable
+BB1:
+        %t3 = xor i133 %B, -1
+        %t4 = trunc i133 %t3 to i100
+	br label %BB3
+BB2:
+        %f1 = or i133 -1, %A		; all-ones
+        %f2 = lshr i133 %f1, 33		; low 100 bits still all-ones
+        %f3 = trunc i133 %f2 to i100		; -1 as i100
+	br label %BB3
+BB3:
+	%Ret = phi i100 [%t4, %BB1], [%f3, %BB2]
+	ret i100 %Ret		; RUN line expects "ret i100 -1"
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-bigarray.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-bigarray.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-bigarray.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-bigarray.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -sccp -S | not grep %X
+
+ at G =  global [1000000 x i10000] zeroinitializer
+
+define internal i10000* @test(i10000 %Arg) {
+	%X = getelementptr [1000000 x i10000], [1000000 x i10000]* @G, i32 0, i32 999		; constant slot &@G[999]
+        store i10000 %Arg, i10000* %X
+	ret i10000* %X		; always returns the same constant GEP; RUN line checks %X is propagated away
+}
+
+define i10000 @caller()
+{
+        %Y = call i10000* @test(i10000 -1)		; stores -1 into @G[999] and returns its address
+        %Z = load i10000, i10000* %Y
+        ret i10000 %Z 
+}
+
+define i10000 @caller2()
+{
+        %Y = call i10000* @test(i10000 1)		; second caller with a different stored value — @G[999] is not a single constant
+        %Z = load i10000, i10000* %Y
+        ret i10000 %Z 
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-bigint.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-bigint.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-bigint.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-bigint.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -sccp -S | not grep xor
+
+define i11129 @test1() {
+        %B = shl i11129 1, 11128 		; sign bit only
+        %C = sub i11129 %B, 1		; all lower bits set
+        %D = xor i11129 %B, %C		; all bits set, i.e. -1; RUN line checks the xor is folded
+        
+	ret i11129 %D
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-bigint2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-bigint2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-bigint2.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-bigint2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,40 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+
+ at Y = constant [6 x i101] [ i101 12, i101 123456789000000, i101 -12,
+                           i101 -123456789000000, i101 0,i101 9123456789000000]
+
+; CHECK-LABEL: @array
+; CHECK-NEXT: ret i101 123456789000000
+define i101 @array() {
+   %A = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 1		; &@Y[1] == 123456789000000
+   %B = load i101, i101* %A
+   %D = and i101 %B, 1		; 0 (even constant)
+   %DD = or i101 %D, 1		; 1
+   %E = trunc i101 %DD to i32
+   %F = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 %E		; back to &@Y[1]
+   %G = load i101, i101* %F
+
+   ret i101 %G		; folds to 123456789000000 per the CHECK lines
+}
+
+; CHECK-LABEL: @large_aggregate
+; CHECK-NEXT: ret i101 undef
+define i101 @large_aggregate() {
+  %B = load i101, i101* undef		; load from undef pointer — result is unconstrained
+  %D = and i101 %B, 1
+  %DD = or i101 %D, 1
+  %F = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 5		; &@Y[5] (last element)
+  %G = getelementptr i101, i101* %F, i101 %DD		; one past the end of @Y
+  %L3 = load i101, i101* %G
+  ret i101 %L3		; CHECK expects "ret i101 undef" — out-of-bounds load must not fold to a bogus constant
+}
+
+; CHECK-LABEL: @index_too_large
+; CHECK-NEXT: store i101* getelementptr (i101, i101* getelementptr ([6 x i101], [6 x i101]* @Y, i32 0, i32 -1), i101 9224497936761618431), i101** undef
+; CHECK-NEXT: ret void
+define void @index_too_large() {
+  %ptr1 = getelementptr [6 x i101], [6 x i101]* @Y, i32 0, i32 -1		; deliberately out-of-bounds base
+  %ptr2 = getelementptr i101, i101* %ptr1, i101 9224497936761618431		; huge offset that must not be mis-folded (see CHECK)
+  store i101* %ptr2, i101** undef
+  ret void
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-ipsccp1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-ipsccp1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-ipsccp1.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-ipsccp1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt < %s -ipsccp -S | grep -v "ret i512 undef" | \
+; RUN:   grep "ret i8 2"
+
+define internal i512 @test(i1 %B) {
+	br i1 %B, label %BB1, label %BB2		; sole call site passes false, so only %BB2 is live
+BB1:
+	%Val = add i512 0, 1
+	br label %BB3
+BB2:
+	br label %BB3
+BB3:
+	%Ret = phi i512 [%Val, %BB1], [2, %BB2]		; resolves to 2 under IPSCCP
+	ret i512 %Ret
+}
+
+define i8 @caller()
+{
+    %t1 = and i2 2, 1		; 0
+    %t11 = trunc i2 %t1 to i1		; false
+    %t2 = call i512 @test(i1 %t11)
+    %t3 = trunc i512 %t2 to i8
+    ret i8 %t3		; RUN line expects this to become "ret i8 2"
+}
+

Added: llvm/trunk/test/Transforms/SCCP/apint-ipsccp2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-ipsccp2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-ipsccp2.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-ipsccp2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt < %s -ipsccp -S | grep -v "ret i101 0" | \
+; RUN:    grep -v "ret i101 undef" | not grep ret
+
+
+define internal i101 @bar(i101 %A) {
+	%x = icmp eq i101 %A, 0		; every reachable call passes 0
+	br i1 %x, label %T, label %F
+T:
+	%B = call i101 @bar(i101 0)		; self-recursion with the same constant argument
+	ret i101 0
+F:      ; unreachable
+	%C = call i101 @bar(i101 1)
+	ret i101 %C
+}
+
+define i101 @foo() {
+	%X = call i101 @bar(i101 0)		; IPSCCP should resolve this to the constant 0
+	ret i101 %X
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-ipsccp3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-ipsccp3.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-ipsccp3.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-ipsccp3.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -ipsccp -S | not grep global
+
+ at G = internal global i66 undef
+
+
+
+define void @foo() {
+	%X = load i66, i66* @G
+	store i66 %X, i66* @G
+	ret void
+}
+
+define i66 @bar() {
+	%V = load i66, i66* @G
+	%C = icmp eq i66 %V, 17
+	br i1 %C, label %T, label %F
+T:
+	store i66 17, i66* @G
+	ret i66 %V
+F:
+	store i66 123, i66* @G
+	ret i66 0
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-ipsccp4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-ipsccp4.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-ipsccp4.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-ipsccp4.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,49 @@
+; This test makes sure that these instructions are properly constant propagated.
+
+; RUN: opt < %s -ipsccp -S | not grep load
+; RUN: opt < %s -ipsccp -S | not grep add
+; RUN: opt < %s -ipsccp -S | not grep phi
+
+
+ at Y = constant [2 x { i212, float }] [ { i212, float } { i212 12, float 1.0 }, 
+                                     { i212, float } { i212 37, float 2.0 } ]
+
+define internal float @test2() {
+	%A = getelementptr [2 x { i212, float}], [2 x { i212, float}]* @Y, i32 0, i32 1, i32 1
+	%B = load float, float* %A
+	ret float %B
+}
+
+define internal float  @test3() {
+	%A = getelementptr [2 x { i212, float}], [2 x { i212, float}]* @Y, i32 0, i32 0, i32 1
+	%B = load float, float* %A
+	ret float %B
+}
+
+define internal float @test()
+{
+   %A = call float @test2()
+   %B = call float @test3()
+
+   %E = fdiv float %B, %A
+   ret float %E
+}
+
+define float @All()
+{
+  %A = call float @test()
+  %B = fcmp oge float %A, 1.0
+  br i1 %B, label %T, label %F
+T:
+  %C = fadd float %A, 1.0
+  br label %exit
+F:
+  %D = fadd float %A, 2.0
+  br label %exit
+exit:
+  %E = phi float [%C, %T], [%D, %F]
+  ret float %E
+}
+
+
+

Added: llvm/trunk/test/Transforms/SCCP/apint-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-load.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-load.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-load.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; This test makes sure that these instructions are properly constant propagated.
+
+; RUN: opt < %s -ipsccp -S | not grep load
+; RUN: opt < %s -ipsccp -S | not grep fdiv
+
+ at X = constant i212 42
+ at Y = constant [2 x { i212, float }] [ { i212, float } { i212 12, float 1.0 }, 
+                                     { i212, float } { i212 37, float 0x3FF3B2FEC0000000 } ]
+define i212 @test1() {
+	%B = load i212, i212* @X
+	ret i212 %B
+}
+
+define internal float @test2() {
+	%A = getelementptr [2 x { i212, float}], [2 x { i212, float}]* @Y, i32 0, i32 1, i32 1
+	%B = load float, float* %A
+	ret float %B
+}
+
+define internal i212 @test3() {
+	%A = getelementptr [2 x { i212, float}], [2 x { i212, float}]* @Y, i32 0, i32 0, i32 0
+	%B = load i212, i212* %A
+	ret i212 %B
+}
+
+define float @All()
+{
+   %A = call float @test2()
+   %B = call i212 @test3()
+   %C = mul i212 %B, -1234567
+   %D = sitofp i212 %C to float
+   %E = fdiv float %A, %D
+   ret float %E
+}
+
+

Added: llvm/trunk/test/Transforms/SCCP/apint-phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-phi.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-phi.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-phi.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt < %s -sccp -S | not grep phi
+
+define i999 @test(i999%A, i1 %c) {
+bb1:
+	br label %BB2
+BB2:
+	%V = phi i999 [2, %bb1], [%A, %BB4]
+	br label %BB3
+
+BB3:
+        %E = trunc i999 %V to i1
+        %F = and i1 %E, %c
+	br i1 %F, label %BB4, label %BB5
+BB4:
+	br label %BB2
+
+BB5:
+	ret i999 %V
+}

Added: llvm/trunk/test/Transforms/SCCP/apint-select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/apint-select.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/apint-select.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/apint-select.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; RUN: opt < %s -sccp -S | not grep select
+
+ at A = constant i32 10
+
+define i712 @test1() {
+        %P = getelementptr i32, i32* @A, i32 0
+        %B = ptrtoint i32* %P to i64
+        %BB = and i64 %B, undef
+        %C = icmp sge i64 %BB, 0
+	%X = select i1 %C, i712 0, i712 1
+	ret i712 %X
+}
+
+
+
+define i712 @test2(i1 %C) {
+	%X = select i1 %C, i712 0, i712 undef
+	ret i712 %X
+}
+
+

Added: llvm/trunk/test/Transforms/SCCP/atomic-load-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/atomic-load-store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/atomic-load-store.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/atomic-load-store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+
+; This transformation is safe for atomic loads and stores; check that it works.
+
+ at G = internal global i32 17
+ at C = internal constant i32 222
+
+define i32 @test1() {
+	%V = load atomic i32, i32* @G seq_cst, align 4
+	%C = icmp eq i32 %V, 17
+	br i1 %C, label %T, label %F
+T:
+	store atomic i32 17, i32* @G seq_cst, align 4
+	ret i32 %V
+F:	
+	store atomic i32 123, i32* @G seq_cst, align 4
+	ret i32 0
+}
+; CHECK-LABEL: define i32 @test1(
+; CHECK-NOT: store
+; CHECK: ret i32 17
+
+define i32 @test2() {
+	%V = load atomic i32, i32* @C seq_cst, align 4
+	ret i32 %V
+}
+
+; CHECK-LABEL: define i32 @test2(
+; CHECK-NOT: load
+; CHECK: ret i32 222

Added: llvm/trunk/test/Transforms/SCCP/atomic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/atomic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/atomic.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/atomic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+
+define i1 @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: @test_cmpxchg
+; CHECK: cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+  %val = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+  %res = extractvalue { i32, i1 } %val, 1
+  ret i1 %res
+}

Added: llvm/trunk/test/Transforms/SCCP/bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/bitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/bitcast.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/bitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+
+define i128 @vector_to_int_cast() {
+  %A = bitcast <4 x i32> <i32 1073741824, i32 1073741824, i32 1073741824, i32 1073741824> to i128
+  ret i128 %A
+}
+
+; CHECK: define i128 @vector_to_int_cast(
+; CHECK-NEXT:  ret i128 85070591750041656499021422275829170176

Added: llvm/trunk/test/Transforms/SCCP/calltest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/calltest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/calltest.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/calltest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; RUN: opt < %s -sccp -loop-deletion -simplifycfg -S | FileCheck %s
+
+declare double @sqrt(double) readnone nounwind
+%empty = type {}
+declare %empty @has_side_effects()
+
+define double @test_0(i32 %param) {
+; CHECK-LABEL: @test_0(
+; CHECK-NOT: br
+entry:
+; No matter how hard you try, sqrt(1.0) is always 1.0.  This allows the
+; optimizer to delete this loop.
+
+	br label %Loop
+Loop:		; preds = %Loop, %entry
+	%I2 = phi i32 [ 0, %entry ], [ %I3, %Loop ]		; <i32> [#uses=1]
+	%V = phi double [ 1.000000e+00, %entry ], [ %V2, %Loop ]		; <double> [#uses=2]
+	%V2 = call double @sqrt( double %V )		; <double> [#uses=1]
+	%I3 = add i32 %I2, 1		; <i32> [#uses=2]
+	%tmp.7 = icmp ne i32 %I3, %param		; <i1> [#uses=1]
+	br i1 %tmp.7, label %Loop, label %Exit
+Exit:		; preds = %Loop
+	ret double %V
+}
+
+define i32 @test_1() {
+; CHECK-LABEL: @test_1(
+; CHECK: call %empty @has_side_effects()
+  %1 = call %empty @has_side_effects()
+  ret i32 0
+}

Added: llvm/trunk/test/Transforms/SCCP/comdat-ipo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/comdat-ipo.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/comdat-ipo.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/comdat-ipo.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+
+; See PR26774
+
+define i32 @baz() {
+  ret i32 10
+}
+
+; We can const-prop @baz's return value *into* @foo, but cannot
+; constprop @foo's return value into bar.
+
+define linkonce_odr i32 @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  %val = call i32 @baz()
+; CHECK-NEXT:  ret i32 10
+
+  %val = call i32 @baz()
+  ret i32 %val
+}
+
+define i32 @bar() {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:  %val = call i32 @foo()
+; CHECK-NEXT:  ret i32 %val
+
+  %val = call i32 @foo()
+  ret i32 %val
+}

Added: llvm/trunk/test/Transforms/SCCP/constant-struct.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/constant-struct.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/constant-struct.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/constant-struct.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,72 @@
+; Test that constant structs are folded.
+; RUN: opt %s -sccp -S | FileCheck %s
+
+define internal {i64} @struct1() {
+  %a = insertvalue {i64} undef, i64 24, 0
+  ret {i64} %a
+}
+
+; CHECK: define internal { i64 } @struct1() {
+; CHECK-NEXT:   ret { i64 } { i64 24 }
+; CHECK-NEXT: }
+
+define internal {i64, i64} @struct2() {
+  %a = insertvalue {i64, i64} undef, i64 24, 0
+  ret {i64, i64} %a
+}
+
+; CHECK: define internal { i64, i64 } @struct2() {
+; CHECK-NEXT:  ret { i64, i64 } { i64 24, i64 undef }
+; CHECK-NEXT: }
+
+define internal {i64, i64, i64} @struct3(i64 %x) {
+  %a = insertvalue {i64, i64, i64} undef, i64 24, 0
+  %b = insertvalue {i64, i64, i64} %a, i64 36, 1
+  %c = insertvalue {i64, i64, i64} %b, i64 %x, 2
+  ret {i64, i64, i64} %c
+}
+
+; CHECK: define internal { i64, i64, i64 } @struct3(i64 %x) {
+; CHECK-NEXT:  %c = insertvalue { i64, i64, i64 } { i64 24, i64 36, i64 undef }, i64 %x, 2
+; CHECK-NEXT:  ret { i64, i64, i64 } %c
+; CHECK-NEXT: }
+
+; Test(s) for overdefined values.
+define internal {i64, i32} @struct4(i32 %x) {
+  %a = insertvalue {i64, i32} {i64 12, i32 24}, i32 %x, 1
+  ret {i64, i32} %a
+}
+
+; CHECK: define internal { i64, i32 } @struct4(i32 %x) {
+; CHECK-NEXT:  %a = insertvalue { i64, i32 } { i64 12, i32 24 }, i32 %x, 1
+; CHECK-NEXT:  ret { i64, i32 } %a
+; CHECK-NEXT: }
+
+define internal {i32} @struct5(i32 %x) {
+  %a = insertvalue {i32} undef, i32 %x, 0
+  ret {i32} %a
+}
+
+; CHECK: define internal { i32 } @struct5(i32 %x) {
+; CHECK-NEXT:  %a = insertvalue { i32 } undef, i32 %x, 0
+; CHECK-NEXT:  ret { i32 } %a
+; CHECK-NEXT: }
+
+
+define internal {i32} @struct6({i32} %x) {
+  %a = insertvalue {i32} %x, i32 12, 0
+  ret {i32} %a
+}
+
+; CHECK: define internal { i32 } @struct6({ i32 } %x) {
+; CHECK-NEXT:  ret { i32 } { i32 12 }
+; CHECK-NEXT: }
+
+define internal {i16} @struct7() {
+  %a = insertvalue {i16} {i16 4}, i16 7, 0
+  ret {i16} %a
+}
+
+; CHECK: define internal { i16 } @struct7() {
+; CHECK-NEXT:  ret { i16 } { i16 7 }
+; CHECK-NEXT: }

Added: llvm/trunk/test/Transforms/SCCP/crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/crash.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt -sccp -S < %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-apple-darwin10.0"
+
+define void @test1(i8 %arg) {
+entry:
+  br i1 undef, label %return, label %bb
+
+bb:   
+  br label %bb34
+
+bb23: 
+  %c = icmp eq i8 %arg, undef 
+  br i1 %c, label %bb34, label %bb23
+
+bb34:
+  %Kind.1 = phi i32 [ undef, %bb ], [ %ins174, %bb23 ] 
+  %mask173 = or i32 %Kind.1, 7
+  %ins174 = and i32 %mask173, -249
+  br label %bb23
+
+return:
+  ret void
+}
+
+define i32 @test2([4 x i32] %A) {
+  %B = extractvalue [4 x i32] %A, 1
+  ret i32 %B
+}
+
+define x86_mmx @test3() {
+  %load = load x86_mmx, x86_mmx* null
+  ret x86_mmx %load
+}

Added: llvm/trunk/test/Transforms/SCCP/definite-initializer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/definite-initializer.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/definite-initializer.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/definite-initializer.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt -S -ipsccp < %s | FileCheck %s
+ at d = internal externally_initialized global i32 0, section ".openbsd.randomdata", align 4
+
+; CHECK-LABEL: @test1(
+define i32 @test1() {
+entry:
+  %load = load i32, i32* @d, align 4
+  ret i32 %load
+; CHECK: %[[load:.*]] = load i32, i32* @d, align 4
+; CHECK: ret i32 %[[load]]
+}

Added: llvm/trunk/test/Transforms/SCCP/dont-zap-return.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/dont-zap-return.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/dont-zap-return.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/dont-zap-return.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt -ipsccp < %s -S | FileCheck %s
+
+define internal {i32, i32} @identity(i32 %patatino) {
+  %foo = insertvalue {i32, i32} {i32 1, i32 undef}, i32 %patatino, 1
+  ret {i32, i32} %foo
+}
+
+; Check that the return value is not transformed to undef
+; CHECK: define internal { i32, i32 } @identity(i32 %patatino) {
+; CHECK-NEXT:  %foo = insertvalue { i32, i32 } { i32 1, i32 undef }, i32 %patatino, 1
+; CHECK-NEXT:  ret { i32, i32 } %foo
+; CHECK-NEXT: }
+
+
+define {i32, i32} @caller(i32 %pat) {
+  %S1 = call {i32, i32} @identity(i32 %pat)
+  ret {i32, i32} %S1
+}
+
+; Check that we don't invent values and propagate them.
+; CHECK: define { i32, i32 } @caller(i32 %pat) {
+; CHECK-NEXT:  %S1 = call { i32, i32 } @identity(i32 %pat)
+; CHECK-NEXT:  ret { i32, i32 } %S1
+; CHECK-NEXT: }

Added: llvm/trunk/test/Transforms/SCCP/global-alias-constprop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/global-alias-constprop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/global-alias-constprop.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/global-alias-constprop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+; RUN: opt < %s -passes=sccp -S | FileCheck %s
+
+ at 0 = private unnamed_addr constant [2 x i32] [i32 -1, i32 1]
+@"\01??_7A@@6B@" = unnamed_addr alias i32, getelementptr inbounds ([2 x i32], [2 x i32]* @0, i32 0, i32 1)
+
+; CHECK: ret i32 1
+
+define i32 @main() {
+  %a = load i32, i32* @"\01??_7A@@6B@"
+  ret i32 %a
+}

Added: llvm/trunk/test/Transforms/SCCP/indirectbr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/indirectbr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/indirectbr.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/indirectbr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,76 @@
+; RUN: opt -S -sccp < %s | FileCheck %s
+
+declare void @BB0_f()
+declare void @BB1_f()
+
+; Make sure we can eliminate what is in BB0 as we know that the indirectbr is going to BB1.
+;
+; CHECK-LABEL: define void @indbrtest1(
+; CHECK-NOT: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest1() {
+entry:
+  indirectbr i8* blockaddress(@indbrtest1, %BB1), [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure we can eliminate what is in BB0 as we know that the indirectbr is going to BB1
+; by looking through the casts. The casts should be folded away when they are visited
+; before the indirectbr instruction.
+;
+; CHECK-LABEL: define void @indbrtest2(
+; CHECK-NOT: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest2() {
+entry:
+  %a = ptrtoint i8* blockaddress(@indbrtest2, %BB1) to i64
+  %b = inttoptr i64 %a to i8*
+  %c = bitcast i8* %b to i8*
+  indirectbr i8* %b, [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure we can not eliminate BB0 as we do not know the target of the indirectbr.
+;
+; CHECK-LABEL: define void @indbrtest3(
+; CHECK: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest3(i8** %Q) {
+entry:
+  %t = load i8*, i8** %Q
+  indirectbr i8* %t, [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure we eliminate BB1 as we pick the first successor on undef.
+;
+; CHECK-LABEL: define void @indbrtest4(
+; CHECK: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest4(i8** %Q) {
+entry:
+  indirectbr i8* undef, [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+

Added: llvm/trunk/test/Transforms/SCCP/ip-constant-ranges.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/ip-constant-ranges.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/ip-constant-ranges.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/ip-constant-ranges.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,198 @@
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+
+; Constant range for %a is [1, 48) and for %b is [301, 1000)
+; CHECK-LABEL: f1
+; CHECK: ret i32 undef
+define internal i32 @f1(i32 %a, i32 %b) {
+entry:
+  %cmp.a = icmp sgt i32 %a, 300
+  %cmp.b = icmp sgt i32 %b, 300
+  %cmp.a2 = icmp ugt i32 %a, 300
+  %cmp.b2 = icmp ugt i32 %b, 300
+
+  %a.1 = select i1 %cmp.a, i32 1, i32 2
+  %b.1 = select i1 %cmp.b, i32 1, i32 2
+  %a.2 = select i1 %cmp.a2, i32 1, i32 2
+  %b.2 = select i1 %cmp.b2, i32 1, i32 2
+  %res1 = add i32 %a.1, %b.1
+  %res2 = add i32 %a.2, %b.2
+  %res3 = add i32 %res1, %res2
+  ret i32 %res3
+}
+
+; Constant range for %x is [47, 302)
+; CHECK-LABEL: f2
+; CHECK: %cmp = icmp sgt i32 %x, 300
+; CHECK: %res1 = select i1 %cmp, i32 1, i32 2
+; CHECK-NEXT: %res4 = select i1 %cmp4, i32 3, i32 4
+; CHECK-NEXT: %res6 = add i32 %res1, 3
+; CHECK-NEXT: %res7 = add i32 5, %res4
+; CHECK-NEXT: %res = add i32 %res6, 5
+; CHECK-NEXT: ret i32 %res
+define internal i32 @f2(i32 %x) {
+entry:
+  %cmp = icmp sgt i32 %x, 300
+  %cmp2 = icmp ne i32 %x, 10
+  %cmp3 = icmp sge i32 %x, 47
+  %cmp4 = icmp ugt i32 %x, 300
+  %cmp5 = icmp uge i32 %x, 47
+  %res1 = select i1 %cmp, i32 1, i32 2
+  %res2 = select i1 %cmp2, i32 3, i32 4
+  %res3 = select i1 %cmp3, i32 5, i32 6
+  %res4 = select i1 %cmp4, i32 3, i32 4
+  %res5 = select i1 %cmp5, i32 5, i32 6
+
+  %res6 = add i32 %res1, %res2
+  %res7 = add i32 %res3, %res4
+  %res = add i32 %res6, %res5
+  ret i32 %res
+}
+
+define i32 @caller1() {
+entry:
+  %call1 = tail call i32 @f1(i32 1, i32 301)
+  %call2 = tail call i32 @f1(i32 47, i32 999)
+  %call3 = tail call i32 @f2(i32 47)
+  %call4 = tail call i32 @f2(i32 301)
+  %res.1 = add nsw i32 12, %call3
+  %res.2 = add nsw i32 %res.1, %call4
+  ret i32 %res.2
+}
+
+; x is overdefined, because constant ranges are only used for parameter
+; values.
+; CHECK-LABEL: f3
+; CHECK: %cmp = icmp sgt i32 %x, 300
+; CHECK: %res = select i1 %cmp, i32 1, i32 2
+; CHECK: ret i32 %res
+define internal i32 @f3(i32 %x) {
+entry:
+  %cmp = icmp sgt i32 %x, 300
+  %res = select i1 %cmp, i32 1, i32 2
+  ret i32 %res
+}
+
+; The phi node could be converted in a ConstantRange.
+define i32 @caller2(i1 %cmp) {
+entry:
+  br i1 %cmp, label %if.true, label %end
+
+if.true:
+  br label %end
+
+end:
+  %res = phi i32 [ 0, %entry], [ 1, %if.true ]
+  %call1 = tail call i32 @f3(i32 %res)
+  ret i32 %call1
+}
+
+; CHECK-LABEL: f4
+; CHECK: %cmp = icmp sgt i32 %x, 300
+; CHECK: %res = select i1 %cmp, i32 1, i32 2
+; CHECK: ret i32 %res
+define internal i32 @f4(i32 %x) {
+entry:
+  %cmp = icmp sgt i32 %x, 300
+  %res = select i1 %cmp, i32 1, i32 2
+  ret i32 %res
+}
+
+; ICmp could introduce bounds on ConstantRanges.
+define i32 @caller3(i32 %x) {
+entry:
+  %cmp = icmp sgt i32 %x, 300
+  br i1 %cmp, label %if.true, label %end
+
+if.true:
+  %x.1 = tail call i32 @f4(i32 %x)
+  br label %end
+
+end:
+  %res = phi i32 [ 0, %entry], [ %x.1, %if.true ]
+  ret i32 %res
+}
+
+; Check to make sure we do not attempt to access lattice values in unreachable
+; blocks.
+define i32 @test_unreachable() {
+entry:
+  call i1 @test_unreachable_callee(i32 1)
+  call i1 @test_unreachable_callee(i32 2)
+  ret i32 1
+}
+
+define internal i1 @test_unreachable_callee(i32 %a) {
+entry:
+  ret i1 true
+
+unreachablebb:
+  %cmp = icmp eq i32 undef, %a
+  unreachable
+}
+
+; Check that we do not attempt to get range info for non-integer types and
+; crash.
+define double @test_struct({ double, double } %test) {
+    %v = extractvalue { double, double } %test, 0
+    %r = fmul double %v, %v
+    ret double %r
+}
+
+; Constant range for %x is [47, 302)
+; CHECK-LABEL: @f5
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %cmp = icmp sgt i32 %x, undef
+; CHECK-NEXT: %res1 = select i1 %cmp, i32 1, i32 2
+; CHECK-NEXT: %res = add i32 %res1, 3
+; CHECK-NEXT: ret i32 %res
+define internal i32 @f5(i32 %x) {
+entry:
+  %cmp = icmp sgt i32 %x, undef
+  %cmp2 = icmp ne i32 undef, %x
+  %res1 = select i1 %cmp, i32 1, i32 2
+  %res2 = select i1 %cmp2, i32 3, i32 4
+
+  %res = add i32 %res1, %res2
+  ret i32 %res
+}
+
+define i32 @caller4() {
+entry:
+  %call1 = tail call i32 @f5(i32 47)
+  %call2 = tail call i32 @f5(i32 301)
+  %res = add nsw i32 %call1, %call2
+  ret i32 %res
+}
+
+; Make sure we do re-evaluate the function after ParamState changes.
+; CHECK-LABEL: @recursive_f
+; CHECK-LABEL: entry:
+; CHECK:  %cmp = icmp eq i32 %i, 0
+; CHECK-NEXT: br i1 %cmp, label %if.then, label %if.else
+define internal i32 @recursive_f(i32 %i) {
+entry:
+  %cmp = icmp eq i32 %i, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  br label %return
+
+if.else:                                          ; preds = %entry
+  %sub = sub nsw i32 %i, 1
+  %call = call i32 @recursive_f(i32 %sub)
+  %add = add i32 %i, %call
+  br label %return
+
+return:                                           ; preds = %if.else, %if.then
+  %retval.0 = phi i32 [ 0, %if.then ], [ %add, %if.else ]
+  ret i32 %retval.0
+}
+
+; CHECK-LABEL: @caller5
+; CHECK: %call = call i32 @recursive_f(i32 42)
+; CHECK-NEXT: ret i32 %call
+define i32 @caller5() {
+entry:
+  %call = call i32 @recursive_f(i32 42)
+  ret i32 %call
+}

Added: llvm/trunk/test/Transforms/SCCP/ipsccp-addr-taken.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/ipsccp-addr-taken.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/ipsccp-addr-taken.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/ipsccp-addr-taken.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt -ipsccp -S < %s | FileCheck %s
+; PR7876
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+define internal i32 @foo() nounwind noinline ssp {
+entry:
+  ret i32 0
+; CHECK-LABEL: @foo(
+; CHECK: entry:
+; CHECK: ret i32 0
+}
+
+declare i32 @bar() 
+
+define internal i32 @test(i32 %c) nounwind noinline ssp {
+bb:
+  %tmp1 = icmp ne i32 %c, 0                       ; <i1> [#uses=1]
+  %tmp2 = select i1 %tmp1, i32 ()* @foo, i32 ()* @bar ; <i32 ()*> [#uses=1]
+  %tmp3 = tail call i32 %tmp2() nounwind          ; <i32> [#uses=1]
+  ret i32 %tmp3
+}
+
+define i32 @main() nounwind ssp {
+bb:
+  %tmp = tail call i32 @test(i32 1)               ; <i32> [#uses=1]
+  ret i32 %tmp
+}

Added: llvm/trunk/test/Transforms/SCCP/ipsccp-basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/ipsccp-basic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/ipsccp-basic.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/ipsccp-basic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,273 @@
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+; RUN: opt < %s -enable-debugify -ipsccp -debugify-quiet -disable-output
+
+;;======================== test1
+
+define internal i32 @test1a(i32 %A) {
+	%X = add i32 1, 2
+	ret i32 %A
+}
+; CHECK-LABEL: define internal i32 @test1a(
+; CHECK: ret i32 undef
+
+define i32 @test1b() {
+	%X = call i32 @test1a( i32 17 )
+	ret i32 %X
+
+; CHECK-LABEL: define i32 @test1b(
+; CHECK: ret i32 17
+}
+
+
+
+;;======================== test2
+
+define internal i32 @test2a(i32 %A) {
+	%C = icmp eq i32 %A, 0	
+	br i1 %C, label %T, label %F
+T:
+	%B = call i32 @test2a( i32 0 )
+	ret i32 0
+F:
+	%C.upgrd.1 = call i32 @test2a(i32 1)
+	ret i32 %C.upgrd.1
+}
+; CHECK-LABEL: define internal i32 @test2a(
+; CHECK-NEXT: br label %T
+; CHECK: ret i32 undef
+
+
+define i32 @test2b() {
+	%X = call i32 @test2a(i32 0)
+	ret i32 %X
+}
+; CHECK-LABEL: define i32 @test2b(
+; CHECK-NEXT: %X = call i32 @test2a(i32 0)
+; CHECK-NEXT: ret i32 0
+
+
+;;======================== test3
+
+ at G = internal global i32 undef
+
+define void @test3a() {
+	%X = load i32, i32* @G
+	store i32 %X, i32* @G
+	ret void
+}
+; CHECK-LABEL: define void @test3a(
+; CHECK-NEXT: ret void
+
+
+define i32 @test3b() {
+	%V = load i32, i32* @G
+	%C = icmp eq i32 %V, 17
+	br i1 %C, label %T, label %F
+T:
+	store i32 17, i32* @G
+	ret i32 %V
+F:	
+	store i32 123, i32* @G
+	ret i32 0
+}
+; CHECK-LABEL: define i32 @test3b(
+; CHECK-NOT: store
+; CHECK: ret i32 0
+
+
+;;======================== test4
+
+define internal {i64,i64} @test4a() {
+  %a = insertvalue {i64,i64} undef, i64 4, 1
+  %b = insertvalue {i64,i64} %a, i64 5, 0
+  ret {i64,i64} %b
+}
+
+; CHECK-LABEL: define internal { i64, i64 } @test4a(
+; CHECK-NEXT:   ret { i64, i64 } undef
+; CHECK-NEXT: }
+
+define i64 @test4b() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+  %a = invoke {i64,i64} @test4a()
+          to label %A unwind label %B
+A:
+  %b = extractvalue {i64,i64} %a, 0
+  %c = call i64 @test4c(i64 %b)
+  ret i64 %c
+B:
+  %val = landingpad { i8*, i32 }
+           catch i8* null
+  ret i64 0
+}
+; CHECK: define i64 @test4b()
+; CHECK:   %c = call i64 @test4c(i64 5)
+; CHECK-NEXT:  ret i64 5
+
+
+define internal i64 @test4c(i64 %a) {
+  ret i64 %a
+}
+; CHECK-LABEL: define internal i64 @test4c(
+; CHECK: ret i64 undef
+
+
+
+;;======================== test5
+
+; PR4313
+define internal {i64,i64} @test5a() {
+  %a = insertvalue {i64,i64} undef, i64 4, 1
+  %b = insertvalue {i64,i64} %a, i64 5, 0
+  ret {i64,i64} %b
+}
+
+define i64 @test5b() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+  %a = invoke {i64,i64} @test5a()
+          to label %A unwind label %B
+A:
+  %c = call i64 @test5c({i64,i64} %a)
+  ret i64 %c
+B:
+  %val = landingpad { i8*, i32 }
+           catch i8* null
+  ret i64 0
+}
+
+; CHECK: define i64 @test5b()
+; CHECK:     A:
+; CHECK-NEXT:  %c = call i64 @test5c({ i64, i64 } { i64 5, i64 4 })
+; CHECK-NEXT:  ret i64 5
+
+define internal i64 @test5c({i64,i64} %a) {
+  %b = extractvalue {i64,i64} %a, 0
+  ret i64 %b
+}
+
+
+;;======================== test6
+
+define i64 @test6a() {
+  ret i64 0
+}
+
+define i64 @test6b() {
+  %a = call i64 @test6a()
+  ret i64 %a
+}
+; CHECK-LABEL: define i64 @test6b(
+; CHECK: ret i64 0
+
+;;======================== test7
+
+
+%T = type {i32,i32}
+
+define internal %T @test7a(i32 %A) {
+  %X = add i32 1, %A
+  %mrv0 = insertvalue %T undef, i32 %X, 0
+  %mrv1 = insertvalue %T %mrv0, i32 %A, 1
+  ret %T %mrv1
+; CHECK-LABEL: @test7a(
+; CHECK-NEXT: ret %T undef
+}
+
+define i32 @test7b() {
+	%X = call %T @test7a(i32 17)
+        %Y = extractvalue %T %X, 0
+	%Z = add i32 %Y, %Y
+	ret i32 %Z
+; CHECK-LABEL: define i32 @test7b(
+; CHECK-NEXT: call %T @test7a(i32 17)
+; CHECK-NEXT: ret i32 36
+}
+
+;;======================== test8
+
+
+define internal {} @test8a(i32 %A, i32* %P) {
+  store i32 %A, i32* %P
+  ret {} {}
+; CHECK-LABEL: @test8a(
+; CHECK-NEXT: store i32 5, 
+; CHECK-NEXT: ret 
+}
+
+define void @test8b(i32* %P) {
+    %X = call {} @test8a(i32 5, i32* %P)
+    ret void
+; CHECK-LABEL: define void @test8b(
+; CHECK-NEXT: call {} @test8a
+; CHECK-NEXT: ret void
+}
+
+;;======================== test9
+
+ at test9g = internal global {  } zeroinitializer
+
+define void @test9() {
+entry:
+        %local_foo = alloca {  }
+        load {  }, {  }* @test9g
+        store {  } %0, {  }* %local_foo
+        ret void
+}
+
+; CHECK-LABEL: define void @test9(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %local_foo = alloca {}
+; CHECK-NEXT:  store {} zeroinitializer, {}* %local_foo
+; CHECK-NEXT: ret void
+
+declare i32 @__gxx_personality_v0(...)
+
+;;======================== test10
+
+define i32 @test10a() nounwind {
+entry:
+  %call = call i32 @test10b(i32 undef)
+  ret i32 %call
+; CHECK-LABEL: define i32 @test10a(
+; CHECK: ret i32 0
+}
+
+define internal i32 @test10b(i32 %x) nounwind {
+entry:
+  %r = and i32 %x, 1
+  ret i32 %r
+; CHECK-LABEL: define internal i32 @test10b(
+; CHECK: ret i32 undef
+}
+
+;;======================== test11
+
+define i64 @test11a() {
+  %xor = xor i64 undef, undef
+  ret i64 %xor
+; CHECK-LABEL: define i64 @test11a
+; CHECK: ret i64 0
+}
+
+define i64 @test11b() {
+  %call1 = call i64 @test11a()
+  %call2 = call i64 @llvm.ctpop.i64(i64 %call1)
+  ret i64 %call2
+; CHECK-LABEL: define i64 @test11b
+; CHECK: %[[call1:.*]] = call i64 @test11a()
+; CHECK-NOT: call i64 @llvm.ctpop.i64
+; CHECK-NEXT: ret i64 0
+}
+
+declare i64 @llvm.ctpop.i64(i64)
+
+;;======================== test12
+;; Ensure that a struct as an arg to a potentially constant-foldable
+;; function does not crash SCCP (for now it'll just ignores it)
+
+define i1 @test12() {
+  %c = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} {i32 -1, i32 32})
+  ret i1 %c
+; CHECK-LABEL: define i1 @test12
+; CHECK: ret i1 %c
+}
+
+declare i1 @llvm.is.constant.sl_i32i32s({i32, i32} %a)

Added: llvm/trunk/test/Transforms/SCCP/ipsccp-branch-unresolved-undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/ipsccp-branch-unresolved-undef.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/ipsccp-branch-unresolved-undef.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/ipsccp-branch-unresolved-undef.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -ipsccp | FileCheck %s
+
+define void @main() {
+; CHECK-LABEL: @main(
+; CHECK:         %call = call i1 @patatino(i1 undef)
+; CHECK-NEXT:    ret void
+;
+  %call = call i1 @patatino(i1 undef)
+  ret void
+}
+
+define internal i1 @patatino(i1 %a) {
+; CHECK-LABEL: define internal i1 @patatino(
+; CHECK-NEXT:    br label [[ONFALSE:%.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT:  onfalse:
+; CHECK-NEXT:    ret i1 undef
+  br i1 %a, label %ontrue, label %onfalse
+ontrue:
+  ret i1 false
+onfalse:
+  ret i1 false
+}

Added: llvm/trunk/test/Transforms/SCCP/ipsccp-phi-one-pred-dead.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/ipsccp-phi-one-pred-dead.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/ipsccp-phi-one-pred-dead.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/ipsccp-phi-one-pred-dead.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -ipsccp | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label %Flow5.pre
+; CHECK:       Flow6:
+; CHECK-NEXT:    br i1 undef, label %end1, label %end2
+; CHECK:       Flow5.pre:
+; CHECK-NEXT:    br label %Flow5
+; CHECK:       Flow5:
+; CHECK-NEXT:    br label %Flow6
+; CHECK:       end1:
+; CHECK-NEXT:    unreachable
+; CHECK:       end2:
+; CHECK-NEXT:    unreachable
+;
+entry:
+  br i1 true, label %Flow5.pre, label %Flow5.pre.unreachable
+
+Flow5.pre.unreachable:
+  br label %Flow5
+
+Flow6:
+  br i1 %0, label %end1, label %end2
+
+Flow5.pre:
+  br label %Flow5
+
+Flow5:
+  %0 = phi i1 [ undef, %Flow5.pre ], [ false, %Flow5.pre.unreachable ]
+  br label %Flow6
+
+end1:
+  unreachable
+
+end2:
+  unreachable
+}

Added: llvm/trunk/test/Transforms/SCCP/ipsccp-preserve-analysis.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/ipsccp-preserve-analysis.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/ipsccp-preserve-analysis.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/ipsccp-preserve-analysis.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,56 @@
+; Basic test to check that DominatorTreeAnalysis is preserved by IPSCCP and
+; the following analysis can re-use it. The test contains two trivial functions
+; IPSCCP can simplify, so we can test the case where IPSCCP makes changes.
+
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN:     -passes='function(require<domtree>,require<postdomtree>),ipsccp,function(require<domtree>,require<postdomtree>)' -S  %s 2>&1 \
+; RUN:     | FileCheck -check-prefixes='IR,NEW-PM' %s
+
+; RUN: opt -passes='function(require<postdomtree>),ipsccp,function(verify<domtree>)' -S  %s | FileCheck -check-prefixes='IR' %s
+
+; NEW-PM: Starting llvm::Module pass manager run.
+; NEW-PM: Running analysis: DominatorTreeAnalysis on f1
+; NEW-PM: Running analysis: PostDominatorTreeAnalysis on f1
+; NEW-PM: Running analysis: DominatorTreeAnalysis on f2
+; NEW-PM: Running analysis: PostDominatorTreeAnalysis on f2
+; NEW-PM: Running pass: IPSCCPPass
+; NEW-PM-DAG: Running analysis: AssumptionAnalysis on f1
+; NEW-PM-DAG: Running analysis: AssumptionAnalysis on f2
+; NEW-PM-NEXT: Invalidating all non-preserved analyses for:
+; NEW-PM-NEXT: Invalidating all non-preserved analyses for: f1
+; NEW-PM-NEXT: Invalidating all non-preserved analyses for: f2
+; NEW-PM-NEXT: Running pass: ModuleToFunctionPassAdaptor
+; NEW-PM-NOT: Running analysis:
+
+; IR-LABEL: @f1
+; IR-LABEL: entry:
+; IR-NEXT: br label %bb2
+; IR-LABEL: bb2:
+; IR-NEXT: undef
+
+; IR-LABEL: @f2
+; IR-NOT: icmp
+; IR:    br label %bbtrue
+; IR-LABEL: bbtrue:
+; IR-NEXT:   ret i32 0
+define internal i32 @f1() readnone {
+entry:
+  br i1 false, label %bb1, label %bb2
+bb1:
+  ret i32 10
+bb2:
+  ret i32 10
+}
+
+define i32 @f2(i32 %n) {
+  %i = call i32 @f1()
+  %cmp = icmp eq i32 %i, 10
+  br i1 %cmp, label %bbtrue, label %bbfalse
+
+bbtrue:
+  ret i32 0
+
+bbfalse:
+  %res = add i32 %n, %i
+  ret i32 %res
+}

Added: llvm/trunk/test/Transforms/SCCP/ipsccp-ssa-copy-nested-conds.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/ipsccp-ssa-copy-nested-conds.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/ipsccp-ssa-copy-nested-conds.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/ipsccp-ssa-copy-nested-conds.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,50 @@
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+; RUN: opt < %s -passes=ipsccp -S | FileCheck %s
+
+; Test for PR39772
+; CHECK-LABEL: cleanup:
+; CHECK-NEXT:   %retval.0 = phi i32 [ 0, %if.then ], [ %add, %if.then7 ], [ %add8, %if.else ]
+
+
+%struct.Node = type { %struct.Node*, %struct.Node*, i32 }
+
+define i32 @check(%struct.Node* %node) {
+entry:
+  %cmp = icmp eq %struct.Node* %node, null
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %cleanup
+
+if.end:                                           ; preds = %entry
+  %left = getelementptr inbounds %struct.Node, %struct.Node* %node, i32 0, i32 0
+  %0 = load %struct.Node*, %struct.Node** %left
+  %call = call i32 @check(%struct.Node* %0)
+  %right = getelementptr inbounds %struct.Node, %struct.Node* %node, i32 0, i32 1
+  %1 = load %struct.Node*, %struct.Node** %right
+  %call1 = call i32 @check(%struct.Node* %1)
+  %2 = load %struct.Node*, %struct.Node** %right
+  %height = getelementptr inbounds %struct.Node, %struct.Node* %2, i32 0, i32 2
+  %3 = load i32, i32* %height
+  %cmp3 = icmp ne i32 %3, %call1
+  br i1 %cmp3, label %if.then4, label %if.end5
+
+if.then4:                                         ; preds = %if.end
+  unreachable
+
+if.end5:                                          ; preds = %if.end
+  %cmp6 = icmp sgt i32 %call, %call1
+  br i1 %cmp6, label %if.then7, label %if.else
+
+if.then7:                                         ; preds = %if.end5
+  %add = add nsw i32 %call, 1
+  br label %cleanup
+
+if.else:                                          ; preds = %if.end5
+  %add8 = add nsw i32 %call1, 1
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.else, %if.then7, %if.then
+  %retval.0 = phi i32 [ 0, %if.then ], [ %add, %if.then7 ], [ %add8, %if.else ]
+  ret i32 %retval.0
+}

Added: llvm/trunk/test/Transforms/SCCP/latticeval-invalidate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/latticeval-invalidate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/latticeval-invalidate.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/latticeval-invalidate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,41 @@
+; RUN: opt -S -sccp %s
+
+ at A = external constant i32
+
+define void @test1() {
+BB4:
+  %A20 = alloca i1
+  %A15 = alloca i64
+  %A7 = alloca i64
+  %A3 = alloca i32**
+  %P = getelementptr i32, i32* @A, i32 0
+  %B = ptrtoint i32* %P to i64
+  %B8 = shl i64 %B, 9223372036854775807
+  %G10 = getelementptr i32*, i32** undef, i64 %B
+  %B10 = urem i64 %B, %B8
+  %B12 = shl i64 %B, %B
+  %BB = and i64 %B, %B8
+  %B1 = xor i64 %B, %B
+  %B23 = lshr i64 %B8, undef
+  %C5 = icmp uge i64 %B, %B10
+  %C17 = fcmp ord double 4.940660e-324, 0x7FEFFFFFFFFFFFFF
+  %C2 = icmp uge i1 %C17, false
+  %G = getelementptr i32, i32* %P, i1 %C17
+  %X = select i1 false, i712 0, i712 1
+  %C4 = icmp ule i1 true, false
+  %B3 = xor i1 %C17, %C2
+  %C33 = icmp slt i1 false, %C5
+  %B15 = sub i64 %B8, %B23
+  %C18 = icmp slt i64 undef, %BB
+  %G29 = getelementptr i32**, i32*** undef, i64 %B15
+  %C35 = icmp eq i1 %C17, undef
+  %C31 = icmp ult i1 %C35, %C5
+  %C29 = icmp sle i1 true, %C5
+  %C16 = icmp ne i16 -1, -32768
+  %A24 = alloca i1
+  %A21 = alloca i1
+  %A25 = alloca i32**
+  %C7 = icmp ule i1 %C4, %B3
+  %C14 = icmp slt i64 %B8, 0
+  ret void
+}

Added: llvm/trunk/test/Transforms/SCCP/loadtest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/loadtest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/loadtest.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/loadtest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; This test makes sure that these instructions are properly constant propagated.
+
+; RUN: opt < %s -data-layout="e-p:32:32" -debugify -sccp -S | FileCheck %s
+; RUN: opt < %s -data-layout="E-p:32:32" -debugify -sccp -S | FileCheck %s
+
+ at X = constant i32 42		; <i32*> [#uses=1]
+ at Y = constant [2 x { i32, float }] [ { i32, float } { i32 12, float 1.000000e+00 }, { i32, float } { i32 37, float 0x3FF3B2FEC0000000 } ]		; <[2 x { i32, float }]*> [#uses=2]
+
+define i32 @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 42
+; CHECK-NEXT: ret
+	%B = load i32, i32* @X		; <i32> [#uses=1]
+	ret i32 %B
+}
+
+define float @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: call void @llvm.dbg.value(metadata float* getelementptr
+; CHECK-NEXT: call void @llvm.dbg.value(metadata float 0x3FF3B2FEC0000000
+; CHECK-NEXT: ret
+	%A = getelementptr [2 x { i32, float }], [2 x { i32, float }]* @Y, i64 0, i64 1, i32 1		; <float*> [#uses=1]
+	%B = load float, float* %A		; <float> [#uses=1]
+	ret float %B
+}
+
+define i32 @test3() {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32* getelementptr
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 12
+; CHECK-NEXT: ret
+	%A = getelementptr [2 x { i32, float }], [2 x { i32, float }]* @Y, i64 0, i64 0, i32 0		; <i32*> [#uses=1]
+	%B = load i32, i32* %A
+	ret i32 %B
+}
+
+define i8 @test4() {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* bitcast
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i8
+; CHECK-NEXT: ret
+	%A = bitcast i32* @X to i8*
+	%B = load i8, i8* %A
+	ret i8 %B
+}
+

Added: llvm/trunk/test/Transforms/SCCP/logical-nuke.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/logical-nuke.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/logical-nuke.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/logical-nuke.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+
+; Test that SCCP has basic knowledge of when and/or/mul nuke overdefined values.
+
+; CHECK-LABEL: test
+; CHECK: ret i32 0
+ define i32 @test(i32 %X) {
+  %Y = and i32 %X, 0
+  ret i32 %Y
+}
+
+; CHECK-LABEL: test2
+; CHECK: ret i32 -1
+define i32 @test2(i32 %X) {
+  %Y = or i32 -1, %X
+  ret i32 %Y
+}
+
+; CHECK-LABEL: test3
+; CHECK: ret i32 0
+define i32 @test3(i32 %X) {
+  %Y = and i32 undef, %X
+  ret i32 %Y
+}
+
+; CHECK-LABEL: test4
+; CHECK: ret i32 -1
+define i32 @test4(i32 %X) {
+  %Y = or i32 %X, undef
+  ret i32 %Y
+}
+
+; X * 0 = 0 even if X is overdefined.
+; CHECK-LABEL: test5
+; CHECK: ret i32 0
+define i32 @test5(i32 %foo) {
+  %patatino = mul i32 %foo, 0
+  ret i32 %patatino
+}

Added: llvm/trunk/test/Transforms/SCCP/overdefined-div.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/overdefined-div.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/overdefined-div.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/overdefined-div.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+
+; Test that SCCP has basic knowledge of when div can nuke overdefined values.
+
+; 0 / X = 0 even if X is overdefined.
+; CHECK-LABEL: test1
+; CHECK-NEXT: ret i32 0
+define i32 @test1(i32 %foo) {
+  %tinkywinky = udiv i32 0, %foo
+  ret i32 %tinkywinky
+}
+
+; CHECK-LABEL: test2
+; CHECK-NEXT: ret i32 0
+define i32 @test2(i32 %foo) {
+  %tinkywinky = sdiv i32 0, %foo
+  ret i32 %tinkywinky
+}
+
+; CHECK-LABEL: test3
+; CHECK: ret i32 %tinkywinky
+define i32 @test3(i32 %foo) {
+  %tinkywinky = udiv i32 %foo, 0
+  ret i32 %tinkywinky
+}
+
+; CHECK-LABEL: test4
+; CHECK: ret i32 %tinkywinky
+define i32 @test4(i32 %foo) {
+  %tinkywinky = sdiv i32 %foo, 0
+  ret i32 %tinkywinky
+}

Added: llvm/trunk/test/Transforms/SCCP/pr27712.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/pr27712.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/pr27712.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/pr27712.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; RUN: opt -sccp -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() {
+entry:
+  br label %lbl_1154
+
+lbl_1154:
+  %b0.0 = phi i32 [ -119, %entry ], [ 0, %lbl_1154 ]
+  %cmp11 = icmp slt i32 %b0.0, 0
+  %shl.op = shl i32 33554432, %b0.0
+  %cmp1445 = icmp ult i32 %shl.op, 33554432
+  %cmp14 = or i1 %cmp11, %cmp1445
+  br i1 %cmp14, label %lbl_1154, label %if.end19
+
+if.end19:
+  br i1 %cmp11, label %if.then22, label %cleanup26
+
+if.then22:
+  tail call void @abort()
+  unreachable
+
+cleanup26:
+  ret i32 %shl.op
+}
+; CHECK-LABEL: define i32 @main(
+; CHECK-NOT: ret i32 undef
+
+declare void @abort()

Added: llvm/trunk/test/Transforms/SCCP/pr35357.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/pr35357.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/pr35357.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/pr35357.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt -S %s -ipsccp | FileCheck %s
+
+ at a = internal global i32 2
+
+define i32 @patatino() {
+; CHECK: @patatino(
+; CHECK: call void @f(i32 undef, i32 1)
+; CHECK-NEXT: call void @f(i32 2, i32 0)
+; CHECK-NEXT: ret i32 0
+entry:
+  call void @f(i32 undef, i32 1)
+  %0 = load i32, i32* @a
+  call void @f(i32 %0, i32 0)
+  ret i32 0
+}
+
+define internal void @f(i32 %c, i32 %d) {
+; CHECK: @f(
+; CHECK:    ret void
+;
+entry:
+  %cmp = icmp ne i32 %c, %d
+  ret void
+}

Added: llvm/trunk/test/Transforms/SCCP/preserve-analysis.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/preserve-analysis.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/preserve-analysis.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/preserve-analysis.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt < %s -debug-pass=Structure -globals-aa -loop-vectorize -sccp -loop-vectorize -globals-aa 2>&1 -S | FileCheck %s
+; RUN: opt < %s -debug-pass-manager -passes='loop-vectorize,sccp,loop-vectorize' 2>&1 -S | FileCheck --check-prefix=NEW-PM %s
+
+; Check CFG-only analysis are preserved by SCCP by running it between 2
+; loop-vectorize runs.
+
+; CHECK: Globals Alias Analysis
+; CHECK: Dominator Tree Construction
+; CHECK: Natural Loop Information
+; CHECK: Sparse Conditional Constant Propagation
+; CHECK-NOT: Dominator Tree Construction
+; CHECK-NOT: Natural Loop Information
+; CHECK-NOT: Globals Alias Analysis
+; CHECK: Loop Vectorization
+
+; NEW-PM-DAG: Running analysis: LoopAnalysis on test
+; NEW-PM-DAG: Running analysis: DominatorTreeAnalysis on test
+; NEW-PM-DAG: Running analysis: AssumptionAnalysis on test
+; NEW-PM-DAG: Running analysis: TargetLibraryAnalysis on test
+; NEW-PM-DAG: Running analysis: TargetIRAnalysis on test
+; NEW-PM: Running pass: SCCPPass on test
+; NEW-PM-NOT: Running analysis: LoopAnalysis on test
+; NEW-PM-NOT: Running analysis: DominatorTreeAnalysis on test
+; NEW-PM-NOT: Running analysis: AssumptionAnalysis on test
+; NEW-PM-NOT: Running analysis: TargetLibraryAnalysis on test
+; NEW-PM-NOT: Running analysis: TargetIRAnalysis on test
+; NEW-PM: Finished llvm::Function pass manager run.
+
+
+define i32 @test() {
+entry:
+  %res = add i32 1, 10
+  ret i32 %res
+}

Added: llvm/trunk/test/Transforms/SCCP/return-zapped.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/return-zapped.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/return-zapped.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/return-zapped.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,62 @@
+; RUN: opt < %s -S -ipsccp | FileCheck %s
+
+; After the first round of Solver.Solve(), the return value of @testf still
+; undefined as we hit a branch on undef. Therefore the conditional branch on
+; @testf's return value in @bar is unknown. In ResolvedUndefsIn, we force the
+; false branch to be feasible. We later discover that @testf actually
+; returns true, so we end up with an unfolded "br i1 true".
+define void @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-LABEL: if.then:
+; CHECK:         [[CALL:%.+]] = call i1 @testf()
+; CHECK-NEXT:    br i1 true, label %if.end, label %if.then
+;
+entry:
+  br label %if.then
+if.then:                                          ; preds = %entry, %if.then
+  %foo = phi i32 [ 0, %entry], [ %next, %if.then]
+  %next = add i32 %foo, 1
+  %call = call i1 @testf()
+  br i1 %call, label %if.end, label %if.then
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define internal i1 @testf() {
+; CHECK-LABEL: define internal i1 @testf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[IF_END3:%.*]]
+; CHECK:       if.end3:
+; CHECK-NEXT:    ret i1 undef
+;
+entry:
+  br i1 undef, label %if.then1, label %if.end3
+
+if.then1:                                         ; preds = %if.end
+  br label %if.end3
+
+if.end3:                                          ; preds = %if.then1, %entry
+  ret i1 true
+}
+
+
+; Call sites in unreachable blocks should not be a problem.
+; CHECK-LABEL: define i1 @test2() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   br label %if.end
+; CHECK-LABEL: if.end:                                           ; preds = %entry
+; CHECK-NEXT:   %call2 = call i1 @testf()
+; CHECK-NEXT:   ret i1 true
+define i1 @test2() {
+entry:
+  br label %if.end
+
+if.then:                                          ; preds = %entry, %if.then
+  %call = call i1 @testf()
+  br i1 %call, label %if.end, label %if.then
+
+if.end:                                           ; preds = %if.then, %entry
+  %call2 = call i1 @testf()
+  ret i1 %call2
+}

Added: llvm/trunk/test/Transforms/SCCP/retvalue-undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/retvalue-undef.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/retvalue-undef.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/retvalue-undef.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -ipsccp -S < %s | FileCheck %s
+; PR6414
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal i32 ()* @f() {
+  ret i32 ()* @g
+}
+
+define internal i32 @g() {
+  ret i32 8
+}
+
+; CHECK: internal i32 @g()
+; CHECK-NEXT: ret i32 8
+
+define internal void @outer_mod() {
+  %1 = call i32 ()* () @f()                      ; <i32 ()*> [#uses=1]
+  %2 = call i32 %1()                              ; <i32> [#uses=0]
+  ret void
+}
+
+define internal void @module_init() {
+  call void @register_outer_mod(void ()* @outer_mod)
+  ret void
+}
+
+declare void @register_outer_mod(void ()*)
+
+define i32 @main() {
+  ret i32 0
+}

Added: llvm/trunk/test/Transforms/SCCP/sccptest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/sccptest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/sccptest.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/sccptest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,58 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+
+; This is a basic sanity check for constant propagation.  The add instruction 
+; should be eliminated.
+
+define i32 @test1(i1 %B) {
+	br i1 %B, label %BB1, label %BB2
+BB1:		; preds = %0
+	%Val = add i32 0, 0		; <i32> [#uses=1]
+	br label %BB3
+BB2:		; preds = %0
+	br label %BB3
+BB3:		; preds = %BB2, %BB1
+	%Ret = phi i32 [ %Val, %BB1 ], [ 1, %BB2 ]		; <i32> [#uses=1]
+	ret i32 %Ret
+        
+; CHECK-LABEL: @test1(
+; CHECK: %Ret = phi i32 [ 0, %BB1 ], [ 1, %BB2 ]
+}
+
+; This is the test case taken from appel's book that illustrates a hard case
+; that SCCP gets right.
+;
+define i32 @test2(i32 %i0, i32 %j0) {
+; CHECK-LABEL: @test2(
+BB1:
+	br label %BB2
+BB2:
+	%j2 = phi i32 [ %j4, %BB7 ], [ 1, %BB1 ]
+	%k2 = phi i32 [ %k4, %BB7 ], [ 0, %BB1 ]
+	%kcond = icmp slt i32 %k2, 100
+	br i1 %kcond, label %BB3, label %BB4
+BB3:
+	%jcond = icmp slt i32 %j2, 20
+	br i1 %jcond, label %BB5, label %BB6
+; CHECK: BB3:
+; CHECK-NEXT: br i1 true, label %BB5, label %BB6
+BB4:
+	ret i32 %j2
+; CHECK: BB4:
+; CHECK-NEXT: ret i32 1
+BB5:
+	%k3 = add i32 %k2, 1
+	br label %BB7
+BB6:
+	%k5 = add i32 %k2, 1
+	br label %BB7
+; CHECK: BB6:
+; CHECK-NEXT: br label %BB7
+BB7:
+	%j4 = phi i32 [ 1, %BB5 ], [ %k2, %BB6 ]
+	%k4 = phi i32 [ %k3, %BB5 ], [ %k5, %BB6 ]
+	br label %BB2
+; CHECK: BB7:
+; CHECK-NEXT: %k4 = phi i32 [ %k3, %BB5 ], [ undef, %BB6 ]
+; CHECK-NEXT: br label %BB2
+}
+

Added: llvm/trunk/test/Transforms/SCCP/select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/select.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/select.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/select.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -sccp -S | not grep select
+
+define i32 @test1(i1 %C) {
+	%X = select i1 %C, i32 0, i32 0		; <i32> [#uses=1]
+	ret i32 %X
+}
+
+define i32 @test2(i1 %C) {
+	%X = select i1 %C, i32 0, i32 undef		; <i32> [#uses=1]
+	ret i32 %X
+}
+

Added: llvm/trunk/test/Transforms/SCCP/switch-multiple-undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/switch-multiple-undef.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/switch-multiple-undef.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/switch-multiple-undef.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt -S -ipsccp < %s | FileCheck %s
+
+declare void @foo()
+declare void @goo()
+declare void @patatino()
+
+define void @test1(i32 %t) {
+  %choice = icmp eq i32 undef, -1
+  switch i1 %choice, label %first [i1 0, label %second
+                                   i1 1, label %third]
+first:
+  call void @foo()
+  ret void
+second:
+  call void @goo()
+  ret void
+third:
+  call void @patatino()
+  ret void
+}
+
+; CHECK: define void @test1(i32 %t) {
+; CHECK-NEXT:   br label %second
+; CHECK: second:
+; CHECK-NEXT:   call void @goo()
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }

Added: llvm/trunk/test/Transforms/SCCP/switch-undef-constantfoldterminator.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/switch-undef-constantfoldterminator.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/switch-undef-constantfoldterminator.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/switch-undef-constantfoldterminator.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -ipsccp -S | FileCheck %s
+
+; This test case used to end up like this:
+;
+;    While deleting: label %lor.rhs
+;    Use still stuck around after Def is destroyed:  br i1 undef, label %lor.rhs, label %land.end
+;    opt: ../lib/IR/Value.cpp: llvm::Value::~Value(): Assertion `use_empty() && "Uses remain when a value is destroyed!"' failed.
+;
+; due to ConstantFoldTerminator rewriting the switch into
+;
+;    br i1 undef, label %lor.rhs, label %land.end
+;
+; while SCCP implementation relied on the terminator to always be folded into
+; an unconditional branch when ConstantFoldTerminator returned true.
+
+define void @f4() {
+; CHECK-LABEL: define void @f4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i16 @f3(i16 undef)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i16 @f3(i16 undef)
+  ret void
+}
+
+define internal i16 @f3(i16 %p1) {
+; CHECK-LABEL: define internal i16 @f3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LAND_END:%.*]]
+; CHECK:       land.end:
+; CHECK-NEXT:    ret i16 undef
+;
+entry:
+  switch i16 %p1, label %land.end [
+  i16 0, label %land.end
+  i16 1, label %lor.rhs
+  ]
+
+lor.rhs:
+  br label %land.end
+
+land.end:
+  ret i16 0
+}
+

Added: llvm/trunk/test/Transforms/SCCP/switch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/switch.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/switch.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/switch.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt -S -sccp < %s | FileCheck %s
+
+; Make sure we always consider the default edge executable for a switch
+; with no cases.
+declare void @foo()
+define void @test1() {
+; CHECK-LABEL: define void @test1(
+; CHECK: call void @foo()
+  switch i32 undef, label %d []
+d:
+  call void @foo()
+  ret void
+}

Added: llvm/trunk/test/Transforms/SCCP/ub-shift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/ub-shift.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/ub-shift.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/ub-shift.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,69 @@
+; RUN: opt < %s -sccp -S | FileCheck %s
+
+; CHECK-LABEL: shift_undef_64
+define void @shift_undef_64(i64* %p) {
+  %r1 = lshr i64 -1, 4294967296 ; 2^32
+  ; CHECK: store i64 undef
+  store i64 %r1, i64* %p
+
+  %r2 = ashr i64 -1, 4294967297 ; 2^32 + 1
+  ; CHECK: store i64 undef
+  store i64 %r2, i64* %p
+
+  %r3 = shl i64 -1, 4294967298 ; 2^32 + 2
+  ; CHECK: store i64 undef
+  store i64 %r3, i64* %p
+
+  ret void
+}
+
+; CHECK-LABEL: shift_undef_65
+define void @shift_undef_65(i65* %p) {
+  %r1 = lshr i65 2, 18446744073709551617
+  ; CHECK: store i65 undef
+  store i65 %r1, i65* %p
+
+  %r2 = ashr i65 4, 18446744073709551617
+  ; CHECK: store i65 undef
+  store i65 %r2, i65* %p
+
+  %r3 = shl i65 1, 18446744073709551617
+  ; CHECK: store i65 undef
+  store i65 %r3, i65* %p
+
+  ret void
+}
+
+; CHECK-LABEL: shift_undef_256
+define void @shift_undef_256(i256* %p) {
+  %r1 = lshr i256 2, 18446744073709551617
+  ; CHECK: store i256 undef
+  store i256 %r1, i256* %p
+
+  %r2 = ashr i256 4, 18446744073709551618
+  ; CHECK: store i256 undef
+  store i256 %r2, i256* %p
+
+  %r3 = shl i256 1, 18446744073709551619
+  ; CHECK: store i256 undef
+  store i256 %r3, i256* %p
+
+  ret void
+}
+
+; CHECK-LABEL: shift_undef_511
+define void @shift_undef_511(i511* %p) {
+  %r1 = lshr i511 -1, 1208925819614629174706276 ; 2^80 + 100
+  ; CHECK: store i511 undef
+  store i511 %r1, i511* %p
+
+  %r2 = ashr i511 -2, 1208925819614629174706200
+  ; CHECK: store i511 undef
+  store i511 %r2, i511* %p
+
+  %r3 = shl i511 -3, 1208925819614629174706180
+  ; CHECK: store i511 undef
+  store i511 %r3, i511* %p
+
+  ret void
+}

Added: llvm/trunk/test/Transforms/SCCP/undef-resolve.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/undef-resolve.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/undef-resolve.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/undef-resolve.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,182 @@
+; RUN: opt -sccp -S < %s | FileCheck %s
+
+
+; PR6940
+define double @test1() {
+  %t = sitofp i32 undef to double
+  ret double %t
+; CHECK-LABEL: @test1(
+; CHECK: ret double 0.0
+}
+
+
+; rdar://7832370
+; Check that lots of stuff doesn't get turned into undef.
+define i32 @test2() nounwind readnone ssp {
+; CHECK-LABEL: @test2(
+init:
+  br label %control.outer.outer
+
+control.outer.loopexit.us-lcssa:                  ; preds = %control
+  br label %control.outer.loopexit
+
+control.outer.loopexit:                           ; preds = %control.outer.loopexit.us-lcssa.us, %control.outer.loopexit.us-lcssa
+  br label %control.outer.outer.backedge
+
+control.outer.outer:                              ; preds = %control.outer.outer.backedge, %init
+  %switchCond.0.ph.ph = phi i32 [ 2, %init ], [ 3, %control.outer.outer.backedge ] ; <i32> [#uses=2]
+  %i.0.ph.ph = phi i32 [ undef, %init ], [ %i.0.ph.ph.be, %control.outer.outer.backedge ] ; <i32> [#uses=1]
+  %tmp4 = icmp eq i32 %i.0.ph.ph, 0               ; <i1> [#uses=1]
+  br i1 %tmp4, label %control.outer.outer.split.us, label %control.outer.outer.control.outer.outer.split_crit_edge
+
+control.outer.outer.control.outer.outer.split_crit_edge: ; preds = %control.outer.outer
+  br label %control.outer
+
+control.outer.outer.split.us:                     ; preds = %control.outer.outer
+  br label %control.outer.us
+
+control.outer.us:                                 ; preds = %bb3.us, %control.outer.outer.split.us
+  %A.0.ph.us = phi i32 [ %switchCond.0.us, %bb3.us ], [ 4, %control.outer.outer.split.us ] ; <i32> [#uses=2]
+  %switchCond.0.ph.us = phi i32 [ %A.0.ph.us, %bb3.us ], [ %switchCond.0.ph.ph, %control.outer.outer.split.us ] ; <i32> [#uses=1]
+  br label %control.us
+
+bb3.us:                                           ; preds = %control.us
+  br label %control.outer.us
+
+bb0.us:                                           ; preds = %control.us
+  br label %control.us
+
+; CHECK: control.us:                                       ; preds = %bb0.us, %control.outer.us
+; CHECK-NEXT:  %switchCond.0.us = phi i32
+; CHECK-NEXT:  switch i32 %switchCond.0.us
+control.us:                                       ; preds = %bb0.us, %control.outer.us
+  %switchCond.0.us = phi i32 [ %A.0.ph.us, %bb0.us ], [ %switchCond.0.ph.us, %control.outer.us ] ; <i32> [#uses=2]
+  switch i32 %switchCond.0.us, label %control.outer.loopexit.us-lcssa.us [
+    i32 0, label %bb0.us
+    i32 1, label %bb1.us-lcssa.us
+    i32 3, label %bb3.us
+    i32 4, label %bb4.us-lcssa.us
+  ]
+
+control.outer.loopexit.us-lcssa.us:               ; preds = %control.us
+  br label %control.outer.loopexit
+
+bb1.us-lcssa.us:                                  ; preds = %control.us
+  br label %bb1
+
+bb4.us-lcssa.us:                                  ; preds = %control.us
+  br label %bb4
+
+control.outer:                                    ; preds = %bb3, %control.outer.outer.control.outer.outer.split_crit_edge
+  %A.0.ph = phi i32 [ %nextId17, %bb3 ], [ 4, %control.outer.outer.control.outer.outer.split_crit_edge ] ; <i32> [#uses=1]
+  %switchCond.0.ph = phi i32 [ 0, %bb3 ], [ %switchCond.0.ph.ph, %control.outer.outer.control.outer.outer.split_crit_edge ] ; <i32> [#uses=1]
+  br label %control
+
+control:                                          ; preds = %bb0, %control.outer
+  %switchCond.0 = phi i32 [ %A.0.ph, %bb0 ], [ %switchCond.0.ph, %control.outer ] ; <i32> [#uses=2]
+  switch i32 %switchCond.0, label %control.outer.loopexit.us-lcssa [
+    i32 0, label %bb0
+    i32 1, label %bb1.us-lcssa
+    i32 3, label %bb3
+    i32 4, label %bb4.us-lcssa
+  ]
+
+bb4.us-lcssa:                                     ; preds = %control
+  br label %bb4
+
+bb4:                                              ; preds = %bb4.us-lcssa, %bb4.us-lcssa.us
+  br label %control.outer.outer.backedge
+
+control.outer.outer.backedge:                     ; preds = %bb4, %control.outer.loopexit
+  %i.0.ph.ph.be = phi i32 [ 1, %bb4 ], [ 0, %control.outer.loopexit ] ; <i32> [#uses=1]
+  br label %control.outer.outer
+
+bb3:                                              ; preds = %control
+  %nextId17 = add i32 %switchCond.0, -2           ; <i32> [#uses=1]
+  br label %control.outer
+
+bb0:                                              ; preds = %control
+  br label %control
+
+bb1.us-lcssa:                                     ; preds = %control
+  br label %bb1
+
+bb1:                                              ; preds = %bb1.us-lcssa, %bb1.us-lcssa.us
+  ret i32 0
+}
+
+; Make sure SCCP honors the xor "idiom"
+; rdar://9956541
+define i32 @test3() {
+  %t = xor i32 undef, undef
+  ret i32 %t
+; CHECK-LABEL: @test3(
+; CHECK: ret i32 0
+}
+
+; Be conservative with FP ops
+define double @test4(double %x) {
+  %t = fadd double %x, undef
+  ret double %t
+; CHECK-LABEL: @test4(
+; CHECK: fadd double %x, undef
+}
+
+; Make sure casts produce a possible value
+define i32 @test5() {
+  %t = sext i8 undef to i32
+  ret i32 %t
+; CHECK-LABEL: @test5(
+; CHECK: ret i32 0
+}
+
+; Make sure ashr produces a possible value
+define i32 @test6() {
+  %t = ashr i32 undef, 31
+  ret i32 %t
+; CHECK-LABEL: @test6(
+; CHECK: ret i32 0
+}
+
+; Make sure lshr produces a possible value
+define i32 @test7() {
+  %t = lshr i32 undef, 31
+  ret i32 %t
+; CHECK-LABEL: @test7(
+; CHECK: ret i32 0
+}
+
+; icmp eq with undef simplifies to undef
+define i1 @test8() {
+  %t = icmp eq i32 undef, -1
+  ret i1 %t
+; CHECK-LABEL: @test8(
+; CHECK: ret i1 undef
+}
+
+; Make sure we don't conclude that relational comparisons simplify to undef
+define i1 @test9() {
+  %t = icmp ugt i32 undef, -1
+  ret i1 %t
+; CHECK-LABEL: @test9(
+; CHECK: icmp ugt
+}
+
+; Make sure we handle extractvalue
+define i64 @test10() { 
+entry:
+  %e = extractvalue { i64, i64 } undef, 1
+  ret i64 %e
+; CHECK-LABEL: @test10(
+; CHECK: ret i64 undef
+}
+
+ at GV = external global i32
+
+define i32 @test11(i1 %tobool) {
+entry:
+  %shr4 = ashr i32 undef, zext (i1 icmp eq (i32* bitcast (i32 (i1)* @test11 to i32*), i32* @GV) to i32)
+  ret i32 %shr4
+; CHECK-LABEL: @test11(
+; CHECK: ret i32 0
+}

Added: llvm/trunk/test/Transforms/SCCP/vector-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SCCP/vector-bitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SCCP/vector-bitcast.ll (added)
+++ llvm/trunk/test/Transforms/SCCP/vector-bitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt -sccp -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+
+; CHECK: store volatile <2 x i64> zeroinitializer, <2 x i64>* %p
+; rdar://11324230
+
+define void @foo(<2 x i64>* %p) nounwind {
+entry:
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %while.body.i, %entry
+  %vWorkExponent.i.033 = phi <4 x i32> [ %sub.i.i, %while.body.i ], [ <i32 939524096, i32 939524096, i32 939524096, i32 939524096>, %entry ]
+  %sub.i.i = add <4 x i32> %vWorkExponent.i.033, <i32 -8388608, i32 -8388608, i32 -8388608, i32 -8388608>
+  %0 = bitcast <4 x i32> %sub.i.i to <2 x i64>
+  %and.i119.i = and <2 x i64> %0, zeroinitializer
+  store volatile <2 x i64> %and.i119.i, <2 x i64>* %p
+  br label %while.body.i
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; Currently disabled for a few subtargets (e.g. Kryo):
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s
+
+define void @f(float* %r, float* %w) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:    [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0
+; CHECK-NEXT:    [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[R0]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
+; CHECK-NEXT:    [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[W0]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    ret void
+;
+; NO_SLP-LABEL: @f(
+; NO_SLP-NEXT:    [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0
+; NO_SLP-NEXT:    [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1
+; NO_SLP-NEXT:    [[F0:%.*]] = load float, float* [[R0]]
+; NO_SLP-NEXT:    [[F1:%.*]] = load float, float* [[R1]]
+; NO_SLP-NEXT:    [[ADD0:%.*]] = fadd float [[F0]], [[F0]]
+; NO_SLP-NEXT:    [[ADD1:%.*]] = fadd float [[F1]], [[F1]]
+; NO_SLP-NEXT:    [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
+; NO_SLP-NEXT:    [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
+; NO_SLP-NEXT:    store float [[ADD0]], float* [[W0]]
+; NO_SLP-NEXT:    store float [[ADD1]], float* [[W1]]
+; NO_SLP-NEXT:    ret void
+;
+  %r0 = getelementptr inbounds float, float* %r, i64 0
+  %r1 = getelementptr inbounds float, float* %r, i64 1
+  %f0 = load float, float* %r0
+  %f1 = load float, float* %r1
+  %add0 = fadd float %f0, %f0
+  %add1 = fadd float %f1, %f1
+  %w0 = getelementptr inbounds float, float* %w, i64 0
+  %w1 = getelementptr inbounds float, float* %w, i64 1
+  store float %add0, float* %w0
+  store float %add1, float* %w1
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/PR38339.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/PR38339.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/PR38339.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=aarch64-apple-ios -mcpu=cyclone -o - %s | FileCheck %s
+
+define void @f1(<2 x i16> %x, i16* %a) {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    store i16 [[TMP1]], i16* [[A:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2
+; CHECK-NEXT:    ret void
+;
+  %t2 = extractelement <2 x i16> %x, i32 0
+  %t3 = extractelement <2 x i16> %x, i32 1
+  %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+  %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+  %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+  %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+  store i16 %t2, i16* %a
+  store i16 %t2, i16* %ptr0
+  store i16 %t3, i16* %ptr1
+  store i16 %t3, i16* %ptr2
+  store i16 %t2, i16* %ptr3
+  ret void
+}
+
+define void @f2(<2 x i16> %x, i16* %a) {
+; CHECK-LABEL: @f2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[CONT:%.*]]
+; CHECK:       cont:
+; CHECK-NEXT:    [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
+; CHECK-NEXT:    [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    store i16 [[TMP0]], i16* [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i16, i16* [[A]], align 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %cont
+
+cont:                                           ; preds = %entry, %cont
+  %xx = phi <2 x i16> [ %x, %entry ], [ undef, %cont ]
+  %aa = phi i16* [ %a, %entry ], [ undef, %cont ]
+  %t2 = extractelement <2 x i16> %xx, i32 0
+  %t3 = extractelement <2 x i16> %xx, i32 1
+  %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+  %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+  %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+  %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+  store i16 %t2, i16* %a
+  store i16 %t2, i16* %ptr0
+  store i16 %t3, i16* %ptr1
+  store i16 %t3, i16* %ptr2
+  store i16 %t2, i16* %ptr3
+  %a_val = load i16, i16* %a, align 2
+  %cmp = icmp eq i16 %a_val, 0
+  br i1 %cmp, label %cont, label %exit
+
+exit:                                           ; preds = %cont
+  ret void
+}
+
+define void @f3(<2 x i16> %x, i16* %a) {
+; CHECK-LABEL: @f3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[CONT:%.*]]
+; CHECK:       cont:
+; CHECK-NEXT:    [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
+; CHECK-NEXT:    [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    store i16 [[TMP0]], i16* [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
+; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i16, i16* [[A]], align 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %cont
+
+cont:                                           ; preds = %entry, %cont
+  %xx = phi <2 x i16> [ %x, %entry ], [ undef, %cont ]
+  %aa = phi i16* [ %a, %entry ], [ undef, %cont ]
+  %t2 = extractelement <2 x i16> %xx, i32 0
+  %t3 = extractelement <2 x i16> %xx, i32 1
+  %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+  %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+  %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+  %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+  store i16 %t3, i16* %a
+  store i16 %t3, i16* %ptr0
+  store i16 %t2, i16* %ptr1
+  store i16 %t2, i16* %ptr2
+  store i16 %t3, i16* %ptr3
+  %a_val = load i16, i16* %a, align 2
+  %cmp = icmp eq i16 %a_val, 0
+  br i1 %cmp, label %cont, label %exit
+
+exit:                                           ; preds = %cont
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/commute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/commute.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/commute.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/commute.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer %s -slp-threshold=-10 | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%structA = type { [2 x float] }
+
+define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[XMIN:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
+; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH:%.*]]
+; CHECK:       for.body3.lr.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
+; CHECK:       for.end27:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+  %conv5 = sitofp i32 %ymin to float
+  %conv = sitofp i32 %xmin to float
+  %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
+  %0 = load float, float* %arrayidx4, align 4
+  %sub = fsub fast float %conv, %0
+  %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
+  %1 = load float, float* %arrayidx9, align 4
+  %sub10 = fsub fast float %conv5, %1
+  %mul11 = fmul fast float %sub, %sub
+  %mul12 = fmul fast float %sub10, %sub10
+  %add = fadd fast float %mul11, %mul12
+  %cmp = fcmp oeq float %add, 0.000000e+00
+  br i1 %cmp, label %for.body3.lr.ph, label %for.end27
+
+for.end27:
+  ret void
+}
+
+define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[XMIN:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
+; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH:%.*]]
+; CHECK:       for.body3.lr.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
+; CHECK:       for.end27:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+  %conv5 = sitofp i32 %ymin to float
+  %conv = sitofp i32 %xmin to float
+  %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
+  %0 = load float, float* %arrayidx4, align 4
+  %sub = fsub fast float %conv, %0
+  %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
+  %1 = load float, float* %arrayidx9, align 4
+  %sub10 = fsub fast float %conv5, %1
+  %mul11 = fmul fast float %sub, %sub
+  %mul12 = fmul fast float %sub10, %sub10
+  %add = fadd fast float %mul12, %mul11         ;;;<---- Operands commuted!!
+  %cmp = fcmp oeq float %add, 0.000000e+00
+  br i1 %cmp, label %for.body3.lr.ph, label %for.end27
+
+for.end27:
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+
+target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
+
+declare void @foo(i64, i64, i64, i64)
+
+define void @test1(<4 x i16> %a, <4 x i16> %b, i64* %p) {
+; Make sure types of sub and its sources are not extended.
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
+; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
+; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]]
+; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
+; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]]
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]]
+; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %z0 = zext <4 x i16> %a to <4 x i32>
+  %z1 = zext <4 x i16> %b to <4 x i32>
+  %sub0 = sub <4 x i32> %z0, %z1
+  %e0 = extractelement <4 x i32> %sub0, i32 0
+  %s0 = sext i32 %e0 to i64
+  %gep0 = getelementptr inbounds i64, i64* %p, i64 %s0
+  %load0 = load i64, i64* %gep0
+  %e1 = extractelement <4 x i32> %sub0, i32 1
+  %s1 = sext i32 %e1 to i64
+  %gep1 = getelementptr inbounds i64, i64* %p, i64 %s1
+  %load1 = load i64, i64* %gep1
+  %e2 = extractelement <4 x i32> %sub0, i32 2
+  %s2 = sext i32 %e2 to i64
+  %gep2 = getelementptr inbounds i64, i64* %p, i64 %s2
+  %load2 = load i64, i64* %gep2
+  %e3 = extractelement <4 x i32> %sub0, i32 3
+  %s3 = sext i32 %e3 to i64
+  %gep3 = getelementptr inbounds i64, i64* %p, i64 %s3
+  %load3 = load i64, i64* %gep3
+  call void @foo(i64 %load0, i64 %load1, i64 %load2, i64 %load3)
+  ret void
+}
+
+define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c3, i64* %p) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[C0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP9]]
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]]
+; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %z0 = zext <4 x i16> %a to <4 x i32>
+  %z1 = zext <4 x i16> %b to <4 x i32>
+  %sub0 = sub <4 x i32> %z0, %z1
+  %e0 = extractelement <4 x i32> %sub0, i32 0
+  %s0 = sext i32 %e0 to i64
+  %a0 = add i64 %s0, %c0
+  %gep0 = getelementptr inbounds i64, i64* %p, i64 %a0
+  %load0 = load i64, i64* %gep0
+  %e1 = extractelement <4 x i32> %sub0, i32 1
+  %s1 = sext i32 %e1 to i64
+  %a1 = add i64 %s1, %c1
+  %gep1 = getelementptr inbounds i64, i64* %p, i64 %a1
+  %load1 = load i64, i64* %gep1
+  %e2 = extractelement <4 x i32> %sub0, i32 2
+  %s2 = sext i32 %e2 to i64
+  %a2 = add i64 %s2, %c2
+  %gep2 = getelementptr inbounds i64, i64* %p, i64 %a2
+  %load2 = load i64, i64* %gep2
+  %e3 = extractelement <4 x i32> %sub0, i32 3
+  %s3 = sext i32 %e3 to i64
+  %a3 = add i64 %s3, %c3
+  %gep3 = getelementptr inbounds i64, i64* %p, i64 %a3
+  %load3 = load i64, i64* %gep3
+  call void @foo(i64 %load0, i64 %load1, i64 %load2, i64 %load3)
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=REMARK %s
+; RUN: opt < %s -S -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=REMARK %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; REMARK-LABEL: Function: gather_multiple_use
+; REMARK:       Args:
+; REMARK-NEXT:    - String: 'Vectorized horizontal reduction with cost '
+; REMARK-NEXT:    - Cost: '-7'
+;
+define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @gather_multiple_use(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT:    ret i32 [[TMP10]]
+;
+  %tmp00 = lshr i32 %a, 15
+  %tmp01 = and i32 %tmp00, 65537
+  %tmp02 = mul nuw i32 %tmp01, 65535
+  %tmp03 = add i32 %tmp02, %a
+  %tmp04 = xor i32 %tmp03, %tmp02
+  %tmp05 = lshr i32 %c, 15
+  %tmp06 = and i32 %tmp05, 65537
+  %tmp07 = mul nuw i32 %tmp06, 65535
+  %tmp08 = add i32 %tmp07, %c
+  %tmp09 = xor i32 %tmp08, %tmp07
+  %tmp10 = lshr i32 %b, 15
+  %tmp11 = and i32 %tmp10, 65537
+  %tmp12 = mul nuw i32 %tmp11, 65535
+  %tmp13 = add i32 %tmp12, %b
+  %tmp14 = xor i32 %tmp13, %tmp12
+  %tmp15 = lshr i32 %d, 15
+  %tmp16 = and i32 %tmp15, 65537
+  %tmp17 = mul nuw i32 %tmp16, 65535
+  %tmp18 = add i32 %tmp17, %d
+  %tmp19 = xor i32 %tmp18, %tmp17
+  %tmp20 = add i32 %tmp09, %tmp04
+  %tmp21 = add i32 %tmp20, %tmp14
+  %tmp22 = add i32 %tmp21, %tmp19
+  ret i32 %tmp22
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,543 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=GENERIC
+; RUN: opt -S -mcpu=kryo -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=KRYO
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we vectorize the index calculations in the
+; gather-reduce pattern shown below. We check cases having i32 and i64
+; subtraction.
+;
+; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
+;     sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
+;     sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
+;     sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
+;   }
+;   return sum;
+; }
+
+define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+; Scalar form (after the CHECK lines): eight unrolled gather steps per loop
+; trip, each computing index = (i32)(*a++) - (i32)b[k] and doing
+; sum += g[index].  The autogenerated GENERIC/KRYO expectations show the
+; i16->i32 widenings and subtractions vectorized into one <8 x i16> load /
+; <8 x i32> sub chain, with the gathers themselves left scalar
+; (extractelement + sext + gep + load).
+; GENERIC-LABEL: @gather_reduce_8x16_i32(
+; GENERIC-NEXT:  entry:
+; GENERIC-NEXT:    [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; GENERIC-NEXT:    br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; GENERIC:       for.body.preheader:
+; GENERIC-NEXT:    br label [[FOR_BODY:%.*]]
+; GENERIC:       for.cond.cleanup.loopexit:
+; GENERIC-NEXT:    br label [[FOR_COND_CLEANUP]]
+; GENERIC:       for.cond.cleanup:
+; GENERIC-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; GENERIC-NEXT:    ret i32 [[SUM_0_LCSSA]]
+; GENERIC:       for.body:
+; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT:    [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; GENERIC-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; GENERIC-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; GENERIC-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; GENERIC-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; GENERIC-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; GENERIC-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; GENERIC-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; GENERIC-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; GENERIC-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; GENERIC-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; GENERIC-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; GENERIC-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; GENERIC-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; GENERIC-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; GENERIC-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; GENERIC-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; GENERIC-NEXT:    [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; GENERIC-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; KRYO-LABEL: @gather_reduce_8x16_i32(
+; KRYO-NEXT:  entry:
+; KRYO-NEXT:    [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; KRYO-NEXT:    br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; KRYO:       for.body.preheader:
+; KRYO-NEXT:    br label [[FOR_BODY:%.*]]
+; KRYO:       for.cond.cleanup.loopexit:
+; KRYO-NEXT:    br label [[FOR_COND_CLEANUP]]
+; KRYO:       for.cond.cleanup:
+; KRYO-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; KRYO-NEXT:    ret i32 [[SUM_0_LCSSA]]
+; KRYO:       for.body:
+; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT:    [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; KRYO-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; KRYO-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; KRYO-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT:    [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; KRYO-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; KRYO-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; KRYO-NEXT:    [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; KRYO-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; KRYO-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; KRYO-NEXT:    [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; KRYO-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; KRYO-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; KRYO-NEXT:    [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; KRYO-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; KRYO-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; KRYO-NEXT:    [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; KRYO-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; KRYO-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; KRYO-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; KRYO-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; KRYO-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; KRYO-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; KRYO-NEXT:    [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; KRYO-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+; for.body only runs when %n > 0; otherwise sum stays 0 in for.cond.cleanup.
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+; Each iteration reads a[0..7] (via %a.addr.0101) and b[0..7], forms the eight
+; i32 differences, and accumulates the eight gathered g[] values into %add66.
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i32
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+  %1 = load i16, i16* %b, align 2
+  %conv2 = zext i16 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i32
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i32
+  %sub8 = sub nsw i32 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i32
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i32
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i32
+  %sub26 = sub nsw i32 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i32
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i32
+  %sub35 = sub nsw i32 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i32
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i32
+  %sub44 = sub nsw i32 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i32
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i32
+  %sub53 = sub nsw i32 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i32
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i32
+  %sub62 = sub nsw i32 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+; i64 variant of @gather_reduce_8x16_i32: the a/b elements are widened to i64
+; and subtracted as i64 before indexing %g.  Per the autogenerated CHECK lines
+; below, both GENERIC and KRYO are still expected to narrow the index math to
+; an <8 x i32> sub whose extracted lanes are sext'd back to i64 for the geps.
+; GENERIC-LABEL: @gather_reduce_8x16_i64(
+; GENERIC-NEXT:  entry:
+; GENERIC-NEXT:    [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; GENERIC-NEXT:    br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; GENERIC:       for.body.preheader:
+; GENERIC-NEXT:    br label [[FOR_BODY:%.*]]
+; GENERIC:       for.cond.cleanup.loopexit:
+; GENERIC-NEXT:    br label [[FOR_COND_CLEANUP]]
+; GENERIC:       for.cond.cleanup:
+; GENERIC-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; GENERIC-NEXT:    ret i32 [[SUM_0_LCSSA]]
+; GENERIC:       for.body:
+; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT:    [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; GENERIC-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; GENERIC-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; GENERIC-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; GENERIC-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; GENERIC-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; GENERIC-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; GENERIC-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; GENERIC-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; GENERIC-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; GENERIC-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; GENERIC-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; GENERIC-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; GENERIC-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; GENERIC-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; GENERIC-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; GENERIC-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; GENERIC-NEXT:    [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; GENERIC-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; KRYO-LABEL: @gather_reduce_8x16_i64(
+; KRYO-NEXT:  entry:
+; KRYO-NEXT:    [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; KRYO-NEXT:    br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; KRYO:       for.body.preheader:
+; KRYO-NEXT:    br label [[FOR_BODY:%.*]]
+; KRYO:       for.cond.cleanup.loopexit:
+; KRYO-NEXT:    br label [[FOR_COND_CLEANUP]]
+; KRYO:       for.cond.cleanup:
+; KRYO-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; KRYO-NEXT:    ret i32 [[SUM_0_LCSSA]]
+; KRYO:       for.body:
+; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT:    [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; KRYO-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; KRYO-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; KRYO-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT:    [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; KRYO-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; KRYO-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; KRYO-NEXT:    [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; KRYO-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; KRYO-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; KRYO-NEXT:    [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; KRYO-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; KRYO-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; KRYO-NEXT:    [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; KRYO-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; KRYO-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; KRYO-NEXT:    [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; KRYO-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; KRYO-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; KRYO-NEXT:    [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; KRYO-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; KRYO-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; KRYO-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; KRYO-NEXT:    [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; KRYO-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+; for.body only runs when %n > 0; otherwise sum stays 0 in for.cond.cleanup.
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+; Same unrolled gather as the i32 test, except each a[i]/b[i] pair is zext'd
+; to i64 and subtracted as i64, and that i64 difference indexes %g directly.
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i64
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+  %1 = load i16, i16* %b, align 2
+  %conv2 = zext i16 %1 to i64
+  %sub = sub nsw i64 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i64
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i64
+  %sub8 = sub nsw i64 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i64
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i64
+  %sub17 = sub nsw i64 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i64
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i64
+  %sub26 = sub nsw i64 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i64
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i64
+  %sub35 = sub nsw i64 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i64
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i64
+  %sub44 = sub nsw i64 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i64
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i64
+  %sub53 = sub nsw i64 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i64
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i64
+  %sub62 = sub nsw i64 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-root.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,318 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
+; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
+; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+ at a = common global [80 x i8] zeroinitializer, align 16
+
+define void @PR28330(i32 %n) {
+; DEFAULT-LABEL: @PR28330(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; DEFAULT-NEXT:    [[P20:%.*]] = add i32 [[P17]], undef
+; DEFAULT-NEXT:    [[P22:%.*]] = add i32 [[P20]], undef
+; DEFAULT-NEXT:    [[P24:%.*]] = add i32 [[P22]], undef
+; DEFAULT-NEXT:    [[P26:%.*]] = add i32 [[P24]], undef
+; DEFAULT-NEXT:    [[P28:%.*]] = add i32 [[P26]], undef
+; DEFAULT-NEXT:    [[P30:%.*]] = add i32 [[P28]], undef
+; DEFAULT-NEXT:    [[P32:%.*]] = add i32 [[P30]], undef
+; DEFAULT-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]])
+; DEFAULT-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
+; DEFAULT-NEXT:    [[P34:%.*]] = add i32 [[P32]], undef
+; DEFAULT-NEXT:    br label [[FOR_BODY]]
+;
+; GATHER-LABEL: @PR28330(
+; GATHER-NEXT:  entry:
+; GATHER-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; GATHER-NEXT:    br label [[FOR_BODY:%.*]]
+; GATHER:       for.body:
+; GATHER-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; GATHER-NEXT:    [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0
+; GATHER-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
+; GATHER-NEXT:    [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1
+; GATHER-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
+; GATHER-NEXT:    [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2
+; GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
+; GATHER-NEXT:    [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3
+; GATHER-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
+; GATHER-NEXT:    [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4
+; GATHER-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
+; GATHER-NEXT:    [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5
+; GATHER-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
+; GATHER-NEXT:    [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6
+; GATHER-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
+; GATHER-NEXT:    [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7
+; GATHER-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; GATHER-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
+; GATHER-NEXT:    [[P20:%.*]] = add i32 [[P17]], [[TMP19]]
+; GATHER-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
+; GATHER-NEXT:    [[P22:%.*]] = add i32 [[P20]], [[TMP20]]
+; GATHER-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
+; GATHER-NEXT:    [[P24:%.*]] = add i32 [[P22]], [[TMP21]]
+; GATHER-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
+; GATHER-NEXT:    [[P26:%.*]] = add i32 [[P24]], [[TMP22]]
+; GATHER-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
+; GATHER-NEXT:    [[P28:%.*]] = add i32 [[P26]], [[TMP23]]
+; GATHER-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
+; GATHER-NEXT:    [[P30:%.*]] = add i32 [[P28]], [[TMP24]]
+; GATHER-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
+; GATHER-NEXT:    [[P32:%.*]] = add i32 [[P30]], [[TMP25]]
+; GATHER-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
+; GATHER-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
+; GATHER-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
+; GATHER-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
+; GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
+; GATHER-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
+; GATHER-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
+; GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
+; GATHER-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
+; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]])
+; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]]
+; GATHER-NEXT:    [[P34:%.*]] = add i32 [[P32]], [[TMP33]]
+; GATHER-NEXT:    br label [[FOR_BODY]]
+;
+; MAX-COST-LABEL: @PR28330(
+; MAX-COST-NEXT:  entry:
+; MAX-COST-NEXT:    [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+; MAX-COST-NEXT:    [[P1:%.*]] = icmp eq i8 [[P0]], 0
+; MAX-COST-NEXT:    [[P2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
+; MAX-COST-NEXT:    [[P3:%.*]] = icmp eq i8 [[P2]], 0
+; MAX-COST-NEXT:    [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+; MAX-COST-NEXT:    [[P5:%.*]] = icmp eq i8 [[P4]], 0
+; MAX-COST-NEXT:    [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+; MAX-COST-NEXT:    [[P7:%.*]] = icmp eq i8 [[P6]], 0
+; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+; MAX-COST-NEXT:    [[P9:%.*]] = icmp eq i8 [[P8]], 0
+; MAX-COST-NEXT:    [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+; MAX-COST-NEXT:    [[P11:%.*]] = icmp eq i8 [[P10]], 0
+; MAX-COST-NEXT:    [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+; MAX-COST-NEXT:    [[P13:%.*]] = icmp eq i8 [[P12]], 0
+; MAX-COST-NEXT:    [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+; MAX-COST-NEXT:    [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
+; MAX-COST:       for.body:
+; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT:    [[P19:%.*]] = select i1 [[P1]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P20:%.*]] = add i32 [[P17]], [[P19]]
+; MAX-COST-NEXT:    [[P21:%.*]] = select i1 [[P3]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P22:%.*]] = add i32 [[P20]], [[P21]]
+; MAX-COST-NEXT:    [[P23:%.*]] = select i1 [[P5]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P24:%.*]] = add i32 [[P22]], [[P23]]
+; MAX-COST-NEXT:    [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P26:%.*]] = add i32 [[P24]], [[P25]]
+; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P28:%.*]] = add i32 [[P26]], [[P27]]
+; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P30:%.*]] = add i32 [[P28]], [[P29]]
+; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[P30]], [[P31]]
+; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT:    br label [[FOR_BODY]]
+;
+entry:
+  %p0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+  %p1 = icmp eq i8 %p0, 0
+  %p2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
+  %p3 = icmp eq i8 %p2, 0
+  %p4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+  %p5 = icmp eq i8 %p4, 0
+  %p6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+  %p7 = icmp eq i8 %p6, 0
+  %p8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+  %p9 = icmp eq i8 %p8, 0
+  %p10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+  %p11 = icmp eq i8 %p10, 0
+  %p12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+  %p13 = icmp eq i8 %p12, 0
+  %p14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+  %p15 = icmp eq i8 %p14, 0
+  br label %for.body
+
+for.body:
+  %p17 = phi i32 [ %p34, %for.body ], [ 0, %entry ]
+  %p19 = select i1 %p1, i32 -720, i32 -80
+  %p20 = add i32 %p17, %p19
+  %p21 = select i1 %p3, i32 -720, i32 -80
+  %p22 = add i32 %p20, %p21
+  %p23 = select i1 %p5, i32 -720, i32 -80
+  %p24 = add i32 %p22, %p23
+  %p25 = select i1 %p7, i32 -720, i32 -80
+  %p26 = add i32 %p24, %p25
+  %p27 = select i1 %p9, i32 -720, i32 -80
+  %p28 = add i32 %p26, %p27
+  %p29 = select i1 %p11, i32 -720, i32 -80
+  %p30 = add i32 %p28, %p29
+  %p31 = select i1 %p13, i32 -720, i32 -80
+  %p32 = add i32 %p30, %p31
+  %p33 = select i1 %p15, i32 -720, i32 -80
+  %p34 = add i32 %p32, %p33
+  br label %for.body
+}
+
+define void @PR32038(i32 %n) {
+; DEFAULT-LABEL: @PR32038(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; DEFAULT-NEXT:    [[P20:%.*]] = add i32 -5, undef
+; DEFAULT-NEXT:    [[P22:%.*]] = add i32 [[P20]], undef
+; DEFAULT-NEXT:    [[P24:%.*]] = add i32 [[P22]], undef
+; DEFAULT-NEXT:    [[P26:%.*]] = add i32 [[P24]], undef
+; DEFAULT-NEXT:    [[P28:%.*]] = add i32 [[P26]], undef
+; DEFAULT-NEXT:    [[P30:%.*]] = add i32 [[P28]], undef
+; DEFAULT-NEXT:    [[P32:%.*]] = add i32 [[P30]], undef
+; DEFAULT-NEXT:    [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]])
+; DEFAULT-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], -5
+; DEFAULT-NEXT:    [[P34:%.*]] = add i32 [[P32]], undef
+; DEFAULT-NEXT:    br label [[FOR_BODY]]
+;
+; GATHER-LABEL: @PR32038(
+; GATHER-NEXT:  entry:
+; GATHER-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; GATHER-NEXT:    br label [[FOR_BODY:%.*]]
+; GATHER:       for.body:
+; GATHER-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; GATHER-NEXT:    [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0
+; GATHER-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
+; GATHER-NEXT:    [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1
+; GATHER-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
+; GATHER-NEXT:    [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2
+; GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
+; GATHER-NEXT:    [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3
+; GATHER-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
+; GATHER-NEXT:    [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4
+; GATHER-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
+; GATHER-NEXT:    [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5
+; GATHER-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
+; GATHER-NEXT:    [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6
+; GATHER-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
+; GATHER-NEXT:    [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7
+; GATHER-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; GATHER-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
+; GATHER-NEXT:    [[P20:%.*]] = add i32 -5, [[TMP19]]
+; GATHER-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
+; GATHER-NEXT:    [[P22:%.*]] = add i32 [[P20]], [[TMP20]]
+; GATHER-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
+; GATHER-NEXT:    [[P24:%.*]] = add i32 [[P22]], [[TMP21]]
+; GATHER-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
+; GATHER-NEXT:    [[P26:%.*]] = add i32 [[P24]], [[TMP22]]
+; GATHER-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
+; GATHER-NEXT:    [[P28:%.*]] = add i32 [[P26]], [[TMP23]]
+; GATHER-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
+; GATHER-NEXT:    [[P30:%.*]] = add i32 [[P28]], [[TMP24]]
+; GATHER-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
+; GATHER-NEXT:    [[P32:%.*]] = add i32 [[P30]], [[TMP25]]
+; GATHER-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
+; GATHER-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
+; GATHER-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
+; GATHER-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
+; GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
+; GATHER-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
+; GATHER-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
+; GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
+; GATHER-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
+; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]])
+; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP35]], -5
+; GATHER-NEXT:    [[P34:%.*]] = add i32 [[P32]], [[TMP33]]
+; GATHER-NEXT:    br label [[FOR_BODY]]
+;
+; MAX-COST-LABEL: @PR32038(
+; MAX-COST-NEXT:  entry:
+; MAX-COST-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
+; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
+; MAX-COST-NEXT:    [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+; MAX-COST-NEXT:    [[P5:%.*]] = icmp eq i8 [[P4]], 0
+; MAX-COST-NEXT:    [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+; MAX-COST-NEXT:    [[P7:%.*]] = icmp eq i8 [[P6]], 0
+; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+; MAX-COST-NEXT:    [[P9:%.*]] = icmp eq i8 [[P8]], 0
+; MAX-COST-NEXT:    [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+; MAX-COST-NEXT:    [[P11:%.*]] = icmp eq i8 [[P10]], 0
+; MAX-COST-NEXT:    [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+; MAX-COST-NEXT:    [[P13:%.*]] = icmp eq i8 [[P12]], 0
+; MAX-COST-NEXT:    [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+; MAX-COST-NEXT:    [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
+; MAX-COST:       for.body:
+; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; MAX-COST-NEXT:    [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0
+; MAX-COST-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
+; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2
+; MAX-COST-NEXT:    [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3
+; MAX-COST-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[P20:%.*]] = add i32 -5, undef
+; MAX-COST-NEXT:    [[P22:%.*]] = add i32 [[P20]], undef
+; MAX-COST-NEXT:    [[P24:%.*]] = add i32 [[P22]], undef
+; MAX-COST-NEXT:    [[P26:%.*]] = add i32 [[P24]], undef
+; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P28:%.*]] = add i32 [[P26]], [[P27]]
+; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]])
+; MAX-COST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]]
+; MAX-COST-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]]
+; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5
+; MAX-COST-NEXT:    [[P30:%.*]] = add i32 [[P28]], [[P29]]
+; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
+; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
+; MAX-COST-NEXT:    [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT:    br label [[FOR_BODY]]
+;
+entry:
+  %p0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+  %p1 = icmp eq i8 %p0, 0
+  %p2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
+  %p3 = icmp eq i8 %p2, 0
+  %p4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+  %p5 = icmp eq i8 %p4, 0
+  %p6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+  %p7 = icmp eq i8 %p6, 0
+  %p8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+  %p9 = icmp eq i8 %p8, 0
+  %p10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+  %p11 = icmp eq i8 %p10, 0
+  %p12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+  %p13 = icmp eq i8 %p12, 0
+  %p14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+  %p15 = icmp eq i8 %p14, 0
+  br label %for.body
+
+for.body:
+  %p17 = phi i32 [ %p34, %for.body ], [ 0, %entry ]
+  %p19 = select i1 %p1, i32 -720, i32 -80
+  %p20 = add i32 -5, %p19
+  %p21 = select i1 %p3, i32 -720, i32 -80
+  %p22 = add i32 %p20, %p21
+  %p23 = select i1 %p5, i32 -720, i32 -80
+  %p24 = add i32 %p22, %p23
+  %p25 = select i1 %p7, i32 -720, i32 -80
+  %p26 = add i32 %p24, %p25
+  %p27 = select i1 %p9, i32 -720, i32 -80
+  %p28 = add i32 %p26, %p27
+  %p29 = select i1 %p11, i32 -720, i32 -80
+  %p30 = add i32 %p28, %p29
+  %p31 = select i1 %p13, i32 -720, i32 -80
+  %p32 = add i32 %p30, %p31
+  %p33 = select i1 %p15, i32 -720, i32 -80
+  %p34 = add i32 %p32, %p33
+  br label %for.body
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
+; RUN: opt -S -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-18 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
+
+
+target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we remove from consideration pairs of seed
+; getelementptrs when they are known to have a constant difference. Such pairs
+; are likely not good candidates for vectorization since one can be computed
+; from the other. We use an unprofitable threshold to force vectorization.
+;
+; int getelementptr(int *g, int n, int w, int x, int y, int z) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[2*i + w]; sum += g[2*i + x];
+;     sum += g[2*i + y]; sum += g[2*i + z];
+;   }
+;   return sum;
+; }
+;
+
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Function:        getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '11'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '5'
+
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Function:        getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '16'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
+
+define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @getelementptr_4x32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i32 1
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP10]]
+; CHECK-NEXT:    [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP13]]
+; CHECK-NEXT:    [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP15]]
+; CHECK-NEXT:    [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]]
+; CHECK-NEXT:    [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[ADD11]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, %x
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+  %t10 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+  %t12 = load i32, i32* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next , %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Function:        getelementptr_2x32
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '11'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '5'
+
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Function:        getelementptr_2x32
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
+
+define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @getelementptr_2x32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i32 1
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP18:%.*]], i32 1
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[T4]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP6]]
+; CHECK-NEXT:    [[T7:%.*]] = or i32 [[T4]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[T7]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP7]]
+; CHECK-NEXT:    [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[T4]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP9]], [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP12]]
+; CHECK-NEXT:    [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]]
+; CHECK-NEXT:    [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[ADD11]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP19]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, 1
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+  %t10 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+  %t12 = load i32, i32* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next , %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/horizontal.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/horizontal.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/horizontal.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,436 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
+
+
+; FIXME: The threshold is changed to keep this test case a bit smaller.
+; The AArch64 cost model should not give such high costs to select statements.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux"
+
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedHorizontalReduction
+; YAML-NEXT: Function:        test_select
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+; YAML-NEXT:   - Cost:            '-8'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '8'
+
+define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) { ; sum of |blk1[i]-blk2[i]| over rows of 4 i32 lanes, h rows
+; CHECK-LABEL: @test_select(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_22:%.*]] = icmp sgt i32 [[H:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP_22]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[S_026:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[J_025:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[P2_024:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR29:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[P1_023:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 undef, [[S_026]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD]], undef
+; CHECK-NEXT:    [[ADD19:%.*]] = add nsw i32 [[ADD11]], undef
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]]
+; CHECK-NEXT:    [[ADD27:%.*]] = add nsw i32 [[ADD19]], undef
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[J_025]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[H]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[S_0_LCSSA]]
+;
+entry:
+  %cmp.22 = icmp sgt i32 %h, 0 ; h > 0: any rows to process?
+  br i1 %cmp.22, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %idx.ext = sext i32 %lx to i64 ; row stride (lx) widened to i64 for pointer GEPs
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %s.026 = phi i32 [ 0, %for.body.lr.ph ], [ %add27, %for.body ]
+  %j.025 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %p2.024 = phi i32* [ %blk2, %for.body.lr.ph ], [ %add.ptr29, %for.body ]
+  %p1.023 = phi i32* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %for.body ]
+  %0 = load i32, i32* %p1.023, align 4
+  %1 = load i32, i32* %p2.024, align 4
+  %sub = sub nsw i32 %0, %1 ; lane 0 difference
+  %cmp2 = icmp slt i32 %sub, 0
+  %sub3 = sub nsw i32 0, %sub
+  %sub3.sub = select i1 %cmp2, i32 %sub3, i32 %sub ; |%sub| via compare/negate/select
+  %add = add nsw i32 %sub3.sub, %s.026 ; accumulate into running sum
+  %arrayidx4 = getelementptr inbounds i32, i32* %p1.023, i64 1
+  %2 = load i32, i32* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %p2.024, i64 1
+  %3 = load i32, i32* %arrayidx5, align 4
+  %sub6 = sub nsw i32 %2, %3
+  %cmp7 = icmp slt i32 %sub6, 0
+  %sub9 = sub nsw i32 0, %sub6
+  %v.1 = select i1 %cmp7, i32 %sub9, i32 %sub6 ; |%sub6|
+  %add11 = add nsw i32 %add, %v.1
+  %arrayidx12 = getelementptr inbounds i32, i32* %p1.023, i64 2
+  %4 = load i32, i32* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32, i32* %p2.024, i64 2
+  %5 = load i32, i32* %arrayidx13, align 4
+  %sub14 = sub nsw i32 %4, %5
+  %cmp15 = icmp slt i32 %sub14, 0
+  %sub17 = sub nsw i32 0, %sub14
+  %sub17.sub14 = select i1 %cmp15, i32 %sub17, i32 %sub14 ; |%sub14|
+  %add19 = add nsw i32 %add11, %sub17.sub14
+  %arrayidx20 = getelementptr inbounds i32, i32* %p1.023, i64 3
+  %6 = load i32, i32* %arrayidx20, align 4
+  %arrayidx21 = getelementptr inbounds i32, i32* %p2.024, i64 3
+  %7 = load i32, i32* %arrayidx21, align 4
+  %sub22 = sub nsw i32 %6, %7
+  %cmp23 = icmp slt i32 %sub22, 0
+  %sub25 = sub nsw i32 0, %sub22
+  %v.3 = select i1 %cmp23, i32 %sub25, i32 %sub22 ; |%sub22|
+  %add27 = add nsw i32 %add19, %v.3
+  %add.ptr = getelementptr inbounds i32, i32* %p1.023, i64 %idx.ext ; blk1 += lx
+  %add.ptr29 = getelementptr inbounds i32, i32* %p2.024, i64 %idx.ext ; blk2 += lx
+  %inc = add nuw nsw i32 %j.025, 1
+  %exitcond = icmp eq i32 %inc, %h ; stop after h rows
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add27, %for.end.loopexit ]
+  ret i32 %s.0.lcssa
+}
+
+;; Check whether SLP can find a reduction phi whose incoming blocks are not
+;; the same as the block containing the phi.
+;;
+;; Came from code like,
+;;
+;; int s = 0;
+;; for (int j = 0; j < h; j++) {
+;;   s += p1[0] * p2[0];
+;;   s += p1[1] * p2[1];
+;;   s += p1[2] * p2[2];
+;;   s += p1[3] * p2[3];
+;;   if (s >= lim)
+;;      break;
+;;   p1 += lx;
+;;   p2 += lx;
+;; }
+define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) { ; dot-product of 4-element rows, early exit once sum >= %lim
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedHorizontalReduction
+; YAML-NEXT: Function:        reduction_with_br
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+; YAML-NEXT:   - Cost:            '-11'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
+; CHECK-LABEL: @reduction_with_br(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_16:%.*]] = icmp sgt i32 [[H:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP_16]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[S_020:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[J_019:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END]] ]
+; CHECK-NEXT:    [[P2_018:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR16:%.*]], [[IF_END]] ]
+; CHECK-NEXT:    [[P1_017:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P1_017]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 undef, [[S_020]]
+; CHECK-NEXT:    [[ADD5:%.*]] = add nsw i32 [[ADD]], undef
+; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[ADD5]], undef
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP5]], [[S_020]]
+; CHECK-NEXT:    [[ADD13:%.*]] = add nsw i32 [[ADD9]], undef
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]]
+; CHECK-NEXT:    br i1 [[CMP14]], label [[IF_END]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_017]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ADD_PTR16]] = getelementptr inbounds i32, i32* [[P2_018]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[J_019]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[H]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[S_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[S_1]]
+;
+entry:
+  %cmp.16 = icmp sgt i32 %h, 0 ; h > 0: any rows to process?
+  br i1 %cmp.16, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %idx.ext = sext i32 %lx to i64 ; row stride (lx) widened to i64 for pointer GEPs
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %if.end
+  %s.020 = phi i32 [ 0, %for.body.lr.ph ], [ %add13, %if.end ]
+  %j.019 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ]
+  %p2.018 = phi i32* [ %blk2, %for.body.lr.ph ], [ %add.ptr16, %if.end ]
+  %p1.017 = phi i32* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %if.end ]
+  %0 = load i32, i32* %p1.017, align 4
+  %1 = load i32, i32* %p2.018, align 4
+  %mul = mul nsw i32 %1, %0 ; lane 0 product
+  %add = add nsw i32 %mul, %s.020 ; accumulate into running sum
+  %arrayidx2 = getelementptr inbounds i32, i32* %p1.017, i64 1
+  %2 = load i32, i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %p2.018, i64 1
+  %3 = load i32, i32* %arrayidx3, align 4
+  %mul4 = mul nsw i32 %3, %2
+  %add5 = add nsw i32 %add, %mul4
+  %arrayidx6 = getelementptr inbounds i32, i32* %p1.017, i64 2
+  %4 = load i32, i32* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32, i32* %p2.018, i64 2
+  %5 = load i32, i32* %arrayidx7, align 4
+  %mul8 = mul nsw i32 %5, %4
+  %add9 = add nsw i32 %add5, %mul8
+  %arrayidx10 = getelementptr inbounds i32, i32* %p1.017, i64 3
+  %6 = load i32, i32* %arrayidx10, align 4
+  %arrayidx11 = getelementptr inbounds i32, i32* %p2.018, i64 3
+  %7 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %7, %6
+  %add13 = add nsw i32 %add9, %mul12
+  %cmp14 = icmp slt i32 %add13, %lim ; keep looping only while sum < lim (the mid-loop break)
+  br i1 %cmp14, label %if.end, label %for.end.loopexit
+
+if.end:                                           ; preds = %for.body
+  %add.ptr = getelementptr inbounds i32, i32* %p1.017, i64 %idx.ext ; blk1 += lx
+  %add.ptr16 = getelementptr inbounds i32, i32* %p2.018, i64 %idx.ext ; blk2 += lx
+  %inc = add nuw nsw i32 %j.019, 1
+  %cmp = icmp slt i32 %inc, %h ; more rows left?
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body, %if.end
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %s.1 = phi i32 [ 0, %entry ], [ %add13, %for.end.loopexit ]
+  ret i32 %s.1
+}
+
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            slp-vectorizer
+; YAML-NEXT: Name:            VectorizedHorizontalReduction
+; YAML-NEXT: Function:        test_unrolled_select
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+; YAML-NEXT:   - Cost:            '-47'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '10'
+
+define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 { ; SAD over rows of 8 i8 lanes (zero-extended to i32), early exit once sum >= %lim
+; CHECK-LABEL: @test_unrolled_select(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_43:%.*]] = icmp sgt i32 [[H:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP_43]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[S_047:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[IF_END_86:%.*]] ]
+; CHECK-NEXT:    [[J_046:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END_86]] ]
+; CHECK-NEXT:    [[P2_045:%.*]] = phi i8* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR88:%.*]], [[IF_END_86]] ]
+; CHECK-NEXT:    [[P1_044:%.*]] = phi i8* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END_86]] ]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX50:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX63:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 7
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[P1_044]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[P2_045]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> [[TMP6]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 undef, [[S_047]]
+; CHECK-NEXT:    [[ADD16:%.*]] = add nsw i32 [[ADD]], undef
+; CHECK-NEXT:    [[ADD27:%.*]] = add nsw i32 [[ADD16]], undef
+; CHECK-NEXT:    [[ADD38:%.*]] = add nsw i32 [[ADD27]], undef
+; CHECK-NEXT:    [[ADD49:%.*]] = add nsw i32 [[ADD38]], undef
+; CHECK-NEXT:    [[ADD60:%.*]] = add nsw i32 [[ADD49]], undef
+; CHECK-NEXT:    [[ADD71:%.*]] = add nsw i32 [[ADD60]], undef
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP9]])
+; CHECK-NEXT:    [[OP_EXTRA]] = add nsw i32 [[TMP10]], [[S_047]]
+; CHECK-NEXT:    [[ADD82:%.*]] = add nsw i32 [[ADD71]], undef
+; CHECK-NEXT:    [[CMP83:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]]
+; CHECK-NEXT:    br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       if.end.86:
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[P1_044]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ADD_PTR88]] = getelementptr inbounds i8, i8* [[P2_045]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[J_046]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[H]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[S_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[S_1]]
+;
+entry:
+  %cmp.43 = icmp sgt i32 %h, 0 ; h > 0: any rows to process?
+  br i1 %cmp.43, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %idx.ext = sext i32 %lx to i64 ; row stride (lx) widened to i64 for pointer GEPs
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %if.end.86
+  %s.047 = phi i32 [ 0, %for.body.lr.ph ], [ %add82, %if.end.86 ]
+  %j.046 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end.86 ]
+  %p2.045 = phi i8* [ %blk2, %for.body.lr.ph ], [ %add.ptr88, %if.end.86 ]
+  %p1.044 = phi i8* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %if.end.86 ]
+  %0 = load i8, i8* %p1.044, align 1
+  %conv = zext i8 %0 to i32
+  %1 = load i8, i8* %p2.045, align 1
+  %conv2 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2 ; lane 0 difference (u8 values widened to i32)
+  %cmp3 = icmp slt i32 %sub, 0
+  %sub5 = sub nsw i32 0, %sub
+  %sub5.sub = select i1 %cmp3, i32 %sub5, i32 %sub ; |%sub| via compare/negate/select
+  %add = add nsw i32 %sub5.sub, %s.047 ; accumulate into running sum
+  %arrayidx6 = getelementptr inbounds i8, i8* %p1.044, i64 1
+  %2 = load i8, i8* %arrayidx6, align 1
+  %conv7 = zext i8 %2 to i32
+  %arrayidx8 = getelementptr inbounds i8, i8* %p2.045, i64 1
+  %3 = load i8, i8* %arrayidx8, align 1
+  %conv9 = zext i8 %3 to i32
+  %sub10 = sub nsw i32 %conv7, %conv9
+  %cmp11 = icmp slt i32 %sub10, 0
+  %sub14 = sub nsw i32 0, %sub10
+  %v.1 = select i1 %cmp11, i32 %sub14, i32 %sub10
+  %add16 = add nsw i32 %add, %v.1
+  %arrayidx17 = getelementptr inbounds i8, i8* %p1.044, i64 2
+  %4 = load i8, i8* %arrayidx17, align 1
+  %conv18 = zext i8 %4 to i32
+  %arrayidx19 = getelementptr inbounds i8, i8* %p2.045, i64 2
+  %5 = load i8, i8* %arrayidx19, align 1
+  %conv20 = zext i8 %5 to i32
+  %sub21 = sub nsw i32 %conv18, %conv20
+  %cmp22 = icmp slt i32 %sub21, 0
+  %sub25 = sub nsw i32 0, %sub21
+  %sub25.sub21 = select i1 %cmp22, i32 %sub25, i32 %sub21
+  %add27 = add nsw i32 %add16, %sub25.sub21
+  %arrayidx28 = getelementptr inbounds i8, i8* %p1.044, i64 3
+  %6 = load i8, i8* %arrayidx28, align 1
+  %conv29 = zext i8 %6 to i32
+  %arrayidx30 = getelementptr inbounds i8, i8* %p2.045, i64 3
+  %7 = load i8, i8* %arrayidx30, align 1
+  %conv31 = zext i8 %7 to i32
+  %sub32 = sub nsw i32 %conv29, %conv31
+  %cmp33 = icmp slt i32 %sub32, 0
+  %sub36 = sub nsw i32 0, %sub32
+  %v.3 = select i1 %cmp33, i32 %sub36, i32 %sub32
+  %add38 = add nsw i32 %add27, %v.3
+  %arrayidx39 = getelementptr inbounds i8, i8* %p1.044, i64 4
+  %8 = load i8, i8* %arrayidx39, align 1
+  %conv40 = zext i8 %8 to i32
+  %arrayidx41 = getelementptr inbounds i8, i8* %p2.045, i64 4
+  %9 = load i8, i8* %arrayidx41, align 1
+  %conv42 = zext i8 %9 to i32
+  %sub43 = sub nsw i32 %conv40, %conv42
+  %cmp44 = icmp slt i32 %sub43, 0
+  %sub47 = sub nsw i32 0, %sub43
+  %sub47.sub43 = select i1 %cmp44, i32 %sub47, i32 %sub43
+  %add49 = add nsw i32 %add38, %sub47.sub43
+  %arrayidx50 = getelementptr inbounds i8, i8* %p1.044, i64 5
+  %10 = load i8, i8* %arrayidx50, align 1
+  %conv51 = zext i8 %10 to i32
+  %arrayidx52 = getelementptr inbounds i8, i8* %p2.045, i64 5
+  %11 = load i8, i8* %arrayidx52, align 1
+  %conv53 = zext i8 %11 to i32
+  %sub54 = sub nsw i32 %conv51, %conv53
+  %cmp55 = icmp slt i32 %sub54, 0
+  %sub58 = sub nsw i32 0, %sub54
+  %v.5 = select i1 %cmp55, i32 %sub58, i32 %sub54
+  %add60 = add nsw i32 %add49, %v.5
+  %arrayidx61 = getelementptr inbounds i8, i8* %p1.044, i64 6
+  %12 = load i8, i8* %arrayidx61, align 1
+  %conv62 = zext i8 %12 to i32
+  %arrayidx63 = getelementptr inbounds i8, i8* %p2.045, i64 6
+  %13 = load i8, i8* %arrayidx63, align 1
+  %conv64 = zext i8 %13 to i32
+  %sub65 = sub nsw i32 %conv62, %conv64
+  %cmp66 = icmp slt i32 %sub65, 0
+  %sub69 = sub nsw i32 0, %sub65
+  %sub69.sub65 = select i1 %cmp66, i32 %sub69, i32 %sub65
+  %add71 = add nsw i32 %add60, %sub69.sub65
+  %arrayidx72 = getelementptr inbounds i8, i8* %p1.044, i64 7
+  %14 = load i8, i8* %arrayidx72, align 1
+  %conv73 = zext i8 %14 to i32
+  %arrayidx74 = getelementptr inbounds i8, i8* %p2.045, i64 7
+  %15 = load i8, i8* %arrayidx74, align 1
+  %conv75 = zext i8 %15 to i32
+  %sub76 = sub nsw i32 %conv73, %conv75
+  %cmp77 = icmp slt i32 %sub76, 0
+  %sub80 = sub nsw i32 0, %sub76
+  %v.7 = select i1 %cmp77, i32 %sub80, i32 %sub76
+  %add82 = add nsw i32 %add71, %v.7
+  %cmp83 = icmp slt i32 %add82, %lim ; keep looping only while sum < lim (the mid-loop break)
+  br i1 %cmp83, label %if.end.86, label %for.end.loopexit
+
+if.end.86:                                        ; preds = %for.body
+  %add.ptr = getelementptr inbounds i8, i8* %p1.044, i64 %idx.ext ; blk1 += lx
+  %add.ptr88 = getelementptr inbounds i8, i8* %p2.045, i64 %idx.ext ; blk2 += lx
+  %inc = add nuw nsw i32 %j.046, 1
+  %cmp = icmp slt i32 %inc, %h ; more rows left?
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body, %if.end.86
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %s.1 = phi i32 [ 0, %entry ], [ %add82, %for.end.loopexit ]
+  ret i32 %s.1
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; RUN: opt -S -basicaa -slp-vectorizer < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+; Holding a value live over a call boundary may require
+; spills and fills. This is the case for <2 x double>,
+; as it occupies a Q register of which there are no
+; callee-saves.
+ 
+; CHECK: load double
+; CHECK: load double
+; CHECK: call void @g
+; CHECK: store double
+; CHECK: store double
+define void @f(double* %p, double* %q) { ; copy p[0..1] to q[0..1] with the values live across a call
+  %addr2 = getelementptr double, double* %q, i32 1 ; &q[1]
+  %addr = getelementptr double, double* %p, i32 1 ; &p[1]
+  %x = load double, double* %p
+  %y = load double, double* %addr
+  call void @g() ; call boundary: a <2 x double> held here would need a spill (no Q-reg callee-saves)
+  store double %x, double* %q
+  store double %y, double* %addr2
+  ret void
+}
+declare void @g()
+
+; Check we deal with loops correctly.
+;
+; CHECK: store <2 x double>
+; CHECK: load <2 x double>
+define void @f2(double* %p, double* %q) { ; infinite loop: store previous iteration's loads, then reload p[0..1]
+entry:
+  br label %loop
+
+loop:
+  %p1 = phi double [0.0, %entry], [%x, %loop] ; value loaded from p[0] last iteration
+  %p2 = phi double [0.0, %entry], [%y, %loop] ; value loaded from p[1] last iteration
+  %addr2 = getelementptr double, double* %q, i32 1 ; &q[1]
+  %addr = getelementptr double, double* %p, i32 1 ; &p[1]
+  store double %p1, double* %q
+  store double %p2, double* %addr2
+
+  %x = load double, double* %p
+  %y = load double, double* %addr
+  br label %loop
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/matmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/matmul.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/matmul.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/matmul.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; This test is reduced from the matrix multiplication benchmark in the test-suite:
+; https://github.com/llvm/llvm-test-suite/tree/master/SingleSource/Benchmarks/Misc/matmul_f64_4x4.c
+; The operations here are expected to be vectorized to <2 x double>.
+; Otherwise, performance will suffer on Cortex-A53.
+
+define void @wrap_mul4(double* nocapture %Out, [2 x double]* nocapture readonly %A, [4 x double]* nocapture readonly %B) {
+; CHECK-LABEL: @wrap_mul4(
+; CHECK-NEXT:    [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[TEMP:%.*]] = load double, double* [[ARRAYIDX1_I]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 0, i64 1
+; CHECK-NEXT:    [[TEMP2:%.*]] = load double, double* [[ARRAYIDX5_I]], align 8
+; CHECK-NEXT:    [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 0
+; CHECK-NEXT:    [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TEMP]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> undef, double [[TEMP2]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2
+; CHECK-NEXT:    [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2
+; CHECK-NEXT:    [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>*
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>*
+; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x double> [[TMP9]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x double> [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0
+; CHECK-NEXT:    [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8
+; CHECK-NEXT:    [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1
+; CHECK-NEXT:    [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TEMP10]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TEMP10]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x double> [[TMP2]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x double> undef, double [[TEMP11]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <2 x double> [[TMP22]], double [[TEMP11]], i32 1
+; CHECK-NEXT:    [[TMP24:%.*]] = fmul <2 x double> [[TMP7]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd <2 x double> [[TMP21]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP20]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = fadd <2 x double> [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast double* [[OUT]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP29]], align 8
+; CHECK-NEXT:    [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 2
+; CHECK-NEXT:    [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 3
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP18]], <2 x double>* [[TMP30]], align 8
+; CHECK-NEXT:    [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 4
+; CHECK-NEXT:    [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 5
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP25]], <2 x double>* [[TMP31]], align 8
+; CHECK-NEXT:    [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 6
+; CHECK-NEXT:    [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 7
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP28]], <2 x double>* [[TMP32]], align 8
+; CHECK-NEXT:    ret void
+;
+  %arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0
+  %temp = load double, double* %arrayidx1.i, align 8
+  %arrayidx3.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 0
+  %temp1 = load double, double* %arrayidx3.i, align 8
+  %mul.i = fmul double %temp, %temp1
+  %arrayidx5.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 1
+  %temp2 = load double, double* %arrayidx5.i, align 8
+  %arrayidx7.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 1, i64 0
+  %temp3 = load double, double* %arrayidx7.i, align 8
+  %mul8.i = fmul double %temp2, %temp3
+  %add.i = fadd double %mul.i, %mul8.i
+  %arrayidx13.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 1
+  %temp4 = load double, double* %arrayidx13.i, align 8
+  %mul14.i = fmul double %temp, %temp4
+  %arrayidx18.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 1, i64 1
+  %temp5 = load double, double* %arrayidx18.i, align 8
+  %mul19.i = fmul double %temp2, %temp5
+  %add20.i = fadd double %mul14.i, %mul19.i
+  %arrayidx25.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 2
+  %temp6 = load double, double* %arrayidx25.i, align 8
+  %mul26.i = fmul double %temp, %temp6
+  %arrayidx30.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 1, i64 2
+  %temp7 = load double, double* %arrayidx30.i, align 8
+  %mul31.i = fmul double %temp2, %temp7
+  %add32.i = fadd double %mul26.i, %mul31.i
+  %arrayidx37.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 3
+  %temp8 = load double, double* %arrayidx37.i, align 8
+  %mul38.i = fmul double %temp, %temp8
+  %arrayidx42.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 1, i64 3
+  %temp9 = load double, double* %arrayidx42.i, align 8
+  %mul43.i = fmul double %temp2, %temp9
+  %add44.i = fadd double %mul38.i, %mul43.i
+  %arrayidx47.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 1, i64 0
+  %temp10 = load double, double* %arrayidx47.i, align 8
+  %mul50.i = fmul double %temp1, %temp10
+  %arrayidx52.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 1, i64 1
+  %temp11 = load double, double* %arrayidx52.i, align 8
+  %mul55.i = fmul double %temp3, %temp11
+  %add56.i = fadd double %mul50.i, %mul55.i
+  %mul62.i = fmul double %temp4, %temp10
+  %mul67.i = fmul double %temp5, %temp11
+  %add68.i = fadd double %mul62.i, %mul67.i
+  %mul74.i = fmul double %temp6, %temp10
+  %mul79.i = fmul double %temp7, %temp11
+  %add80.i = fadd double %mul74.i, %mul79.i
+  %mul86.i = fmul double %temp8, %temp10
+  %mul91.i = fmul double %temp9, %temp11
+  %add92.i = fadd double %mul86.i, %mul91.i
+  store double %add.i, double* %Out, align 8
+  %Res.i.sroa.4.0.Out2.i.sroa_idx2 = getelementptr inbounds double, double* %Out, i64 1
+  store double %add20.i, double* %Res.i.sroa.4.0.Out2.i.sroa_idx2, align 8
+  %Res.i.sroa.5.0.Out2.i.sroa_idx4 = getelementptr inbounds double, double* %Out, i64 2
+  store double %add32.i, double* %Res.i.sroa.5.0.Out2.i.sroa_idx4, align 8
+  %Res.i.sroa.6.0.Out2.i.sroa_idx6 = getelementptr inbounds double, double* %Out, i64 3
+  store double %add44.i, double* %Res.i.sroa.6.0.Out2.i.sroa_idx6, align 8
+  %Res.i.sroa.7.0.Out2.i.sroa_idx8 = getelementptr inbounds double, double* %Out, i64 4
+  store double %add56.i, double* %Res.i.sroa.7.0.Out2.i.sroa_idx8, align 8
+  %Res.i.sroa.8.0.Out2.i.sroa_idx10 = getelementptr inbounds double, double* %Out, i64 5
+  store double %add68.i, double* %Res.i.sroa.8.0.Out2.i.sroa_idx10, align 8
+  %Res.i.sroa.9.0.Out2.i.sroa_idx12 = getelementptr inbounds double, double* %Out, i64 6
+  store double %add80.i, double* %Res.i.sroa.9.0.Out2.i.sroa_idx12, align 8
+  %Res.i.sroa.10.0.Out2.i.sroa_idx14 = getelementptr inbounds double, double* %Out, i64 7
+  store double %add92.i, double* %Res.i.sroa.10.0.Out2.i.sroa_idx14, align 8
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; This test ensures that we do not regress due to PR26364. The vectorizer
+; should not compute a smaller size for %k.13 since it is in a use-def cycle
+; and cannot be demoted.
+;
+define fastcc void @PR26364() {
+; CHECK-LABEL: @PR26364(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[FOR_END11:%.*]], label [[FOR_COND4:%.*]]
+; CHECK:       for.cond4:
+; CHECK-NEXT:    [[K_13:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[K_3:%.*]], [[FOR_COND4]] ]
+; CHECK-NEXT:    [[E_02:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 0, [[FOR_COND4]] ]
+; CHECK-NEXT:    [[E_1:%.*]] = select i1 undef, i32 [[E_02]], i32 0
+; CHECK-NEXT:    [[K_3]] = select i1 undef, i32 [[K_13]], i32 undef
+; CHECK-NEXT:    br label [[FOR_COND4]]
+; CHECK:       for.end11:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 undef, label %for.end11, label %for.cond4
+
+for.cond4:
+  %k.13 = phi i32 [ undef, %entry ], [ %k.3, %for.cond4 ]
+  %e.02 = phi i32 [ 1, %entry ], [ 0, %for.cond4 ]
+  %e.1 = select i1 undef, i32 %e.02, i32 0
+  %k.3 = select i1 undef, i32 %k.13, i32 undef
+  br label %for.cond4
+
+for.end11:
+  ret void
+}
+
+; This test ensures that we do not regress due to PR26629. We must look at
+; every root in the vectorizable tree when computing minimum sizes since one
+; root may require fewer bits than another.
+;
+define void @PR26629(i32* %c) {
+; CHECK-LABEL: @PR26629(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[FOR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[C:%.*]], align 4
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[D:%.*]] = phi i72 [ 576507472957710340, [[FOR_PH]] ], [ [[BF_SET17:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP0]], undef
+; CHECK-NEXT:    [[BF_CLEAR13:%.*]] = and i72 [[D]], -576460748008464384
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[SUB]] to i72
+; CHECK-NEXT:    [[BF_VALUE15:%.*]] = and i72 [[TMP1]], 8191
+; CHECK-NEXT:    [[BF_CLEAR16:%.*]] = or i72 [[BF_VALUE15]], [[BF_CLEAR13]]
+; CHECK-NEXT:    [[BF_SET17]] = or i72 [[BF_CLEAR16]], undef
+; CHECK-NEXT:    br label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 undef, label %for.ph, label %for.end
+
+for.ph:
+  %0 = load i32, i32* %c, align 4
+  br label %for.body
+
+for.body:
+  %d = phi i72 [ 576507472957710340, %for.ph ], [ %bf.set17, %for.body ]
+  %sub = sub i32 %0, undef
+  %bf.clear13 = and i72 %d, -576460748008464384
+  %1 = zext i32 %sub to i72
+  %bf.value15 = and i72 %1, 8191
+  %bf.clear16 = or i72 %bf.value15, %bf.clear13
+  %bf.set17 = or i72 %bf.clear16, undef
+  br label %for.body
+
+for.end:
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+define i64 @mismatched_intrinsics(<4 x i32> %in1, <2 x i32> %in2) nounwind {
+; CHECK-LABEL: @mismatched_intrinsics(
+; CHECK-NEXT:    [[VADDLVQ_S32_I:%.*]] = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> [[IN1:%.*]])
+; CHECK-NEXT:    [[VADDLV_S32_I:%.*]] = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> [[IN2:%.*]])
+; CHECK-NEXT:    [[TST:%.*]] = icmp sgt i64 [[VADDLVQ_S32_I]], [[VADDLV_S32_I]]
+; CHECK-NEXT:    [[EQUAL:%.*]] = sext i1 [[TST]] to i64
+; CHECK-NEXT:    ret i64 [[EQUAL]]
+;
+
+  %vaddlvq_s32.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %in1) #2
+  %vaddlv_s32.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %in2) #2
+  %tst = icmp sgt i64 %vaddlvq_s32.i, %vaddlv_s32.i
+  %equal = sext i1 %tst to i64
+  ret i64 %equal
+}
+
+declare i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %in1)
+declare i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %in1)

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -basicaa -slp-vectorizer -dce < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+define void @foo(float* noalias %a, float* noalias %b, float* noalias %c) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !nontemporal !0
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4, !nontemporal !0
+; CHECK-NEXT:    ret void
+;
+entry:
+; Check that we don't lose !nontemporal hint when vectorizing loads.
+  %b1 = load float, float* %b, align 4, !nontemporal !0
+  %arrayidx.1 = getelementptr inbounds float, float* %b, i64 1
+  %b2 = load float, float* %arrayidx.1, align 4, !nontemporal !0
+  %arrayidx.2 = getelementptr inbounds float, float* %b, i64 2
+  %b3 = load float, float* %arrayidx.2, align 4, !nontemporal !0
+  %arrayidx.3 = getelementptr inbounds float, float* %b, i64 3
+  %b4 = load float, float* %arrayidx.3, align 4, !nontemporal !0
+
+; Check that we don't introduce !nontemporal hint when the original scalar loads didn't have it.
+  %c1 = load float, float* %c, align 4
+  %arrayidx2.1 = getelementptr inbounds float, float* %c, i64 1
+  %c2 = load float, float* %arrayidx2.1, align 4
+  %arrayidx2.2 = getelementptr inbounds float, float* %c, i64 2
+  %c3 = load float, float* %arrayidx2.2, align 4
+  %arrayidx2.3 = getelementptr inbounds float, float* %c, i64 3
+  %c4 = load float, float* %arrayidx2.3, align 4
+
+  %a1 = fadd float %b1, %c1
+  %a2 = fadd float %b2, %c2
+  %a3 = fadd float %b3, %c3
+  %a4 = fadd float %b4, %c4
+
+; Check that we don't lose !nontemporal hint when vectorizing stores.
+  store float %a1, float* %a, align 4, !nontemporal !0
+  %arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1
+  store float %a2, float* %arrayidx3.1, align 4, !nontemporal !0
+  %arrayidx3.2 = getelementptr inbounds float, float* %a, i64 2
+  store float %a3, float* %arrayidx3.2, align 4, !nontemporal !0
+  %arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3
+  store float %a4, float* %arrayidx3.3, align 4, !nontemporal !0
+
+  ret void
+}
+
+define void @foo2(float* noalias %a, float* noalias %b) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+; Check that we don't mark vector load with !nontemporal attribute if some of
+; the original scalar loads don't have it.
+  %b1 = load float, float* %b, align 4, !nontemporal !0
+  %arrayidx.1 = getelementptr inbounds float, float* %b, i64 1
+  %b2 = load float, float* %arrayidx.1, align 4
+  %arrayidx.2 = getelementptr inbounds float, float* %b, i64 2
+  %b3 = load float, float* %arrayidx.2, align 4
+  %arrayidx.3 = getelementptr inbounds float, float* %b, i64 3
+  %b4 = load float, float* %arrayidx.3, align 4, !nontemporal !0
+
+; Check that we don't mark vector store with !nontemporal attribute if some of
+; the original scalar stores don't have it.
+  store float %b1, float* %a, align 4, !nontemporal !0
+  %arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1
+  store float %b2, float* %arrayidx3.1, align 4
+  %arrayidx3.2 = getelementptr inbounds float, float* %a, i64 2
+  store float %b3, float* %arrayidx3.2, align 4
+  %arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3
+  store float %b4, float* %arrayidx3.3, align 4, !nontemporal !0
+
+  ret void
+}
+
+!0 = !{i32 1}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/remarks.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
+
+define void @f(double* %r, double* %w) {
+  %r0 = getelementptr inbounds double, double* %r, i64 0
+  %r1 = getelementptr inbounds double, double* %r, i64 1
+  %f0 = load double, double* %r0
+  %f1 = load double, double* %r1
+  %add0 = fadd double %f0, %f0
+  %add1 = fadd double %f1, %f1
+  %w0 = getelementptr inbounds double, double* %w, i64 0
+  %w1 = getelementptr inbounds double, double* %w, i64 1
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
+  store double %add0, double* %w0, !dbg !9
+  store double %add1, double* %w1
+  ret void
+}
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
+!7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 10, scope: !7)

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i32, i32* %b, align 4
+  %1 = load i32, i32* %c, align 4
+  %add = add nsw i32 %1, %0
+  %div = sdiv i32 %add, 2
+  store i32 %div, i32* %a, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 1
+  %2 = load i32, i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 1
+  %3 = load i32, i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %3, %2
+  %div6 = sdiv i32 %add5, 2
+  %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1
+  store i32 %div6, i32* %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 2
+  %4 = load i32, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 2
+  %5 = load i32, i32* %arrayidx9, align 4
+  %add10 = add nsw i32 %5, %4
+  %div11 = sdiv i32 %add10, 2
+  %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2
+  store i32 %div11, i32* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32, i32* %b, i64 3
+  %6 = load i32, i32* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 3
+  %7 = load i32, i32* %arrayidx14, align 4
+  %add15 = add nsw i32 %7, %6
+  %div16 = sdiv i32 %add15, 2
+  %arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3
+  store i32 %div16, i32* %arrayidx17, align 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Debug information shouldn't affect spill cost.
+; RUN: opt -S -slp-vectorizer %s -o - | FileCheck %s
+
+target triple = "aarch64"
+
+%struct.S = type { i64, i64 }
+
+define void @patatino(i64 %n, i64 %i, %struct.S* %p) !dbg !7 {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[N:%.*]], metadata !18, metadata !DIExpression()), !dbg !23
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[I:%.*]], metadata !19, metadata !DIExpression()), !dbg !24
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata %struct.S* [[P:%.*]], metadata !20, metadata !DIExpression()), !dbg !25
+; CHECK-NEXT:    [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P]], i64 [[N]], i32 0, !dbg !26
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 undef, metadata !21, metadata !DIExpression()), !dbg !27
+; CHECK-NEXT:    [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg !28
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[X1]] to <2 x i64>*, !dbg !26
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg !26, !tbaa !29
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 undef, metadata !22, metadata !DIExpression()), !dbg !33
+; CHECK-NEXT:    [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg !34
+; CHECK-NEXT:    [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg !35
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[X5]] to <2 x i64>*, !dbg !36
+; CHECK-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg !36, !tbaa !29
+; CHECK-NEXT:    ret void, !dbg !37
+;
+entry:
+  call void @llvm.dbg.value(metadata i64 %n, metadata !18, metadata !DIExpression()), !dbg !23
+  call void @llvm.dbg.value(metadata i64 %i, metadata !19, metadata !DIExpression()), !dbg !24
+  call void @llvm.dbg.value(metadata %struct.S* %p, metadata !20, metadata !DIExpression()), !dbg !25
+  %x1 = getelementptr inbounds %struct.S, %struct.S* %p, i64 %n, i32 0, !dbg !26
+  %0 = load i64, i64* %x1, align 8, !dbg !26, !tbaa !27
+  call void @llvm.dbg.value(metadata i64 %0, metadata !21, metadata !DIExpression()), !dbg !32
+  %y3 = getelementptr inbounds %struct.S, %struct.S* %p, i64 %n, i32 1, !dbg !33
+  %1 = load i64, i64* %y3, align 8, !dbg !33, !tbaa !34
+  call void @llvm.dbg.value(metadata i64 %1, metadata !22, metadata !DIExpression()), !dbg !35
+  %x5 = getelementptr inbounds %struct.S, %struct.S* %p, i64 %i, i32 0, !dbg !36
+  store i64 %0, i64* %x5, align 8, !dbg !37, !tbaa !27
+  %y7 = getelementptr inbounds %struct.S, %struct.S* %p, i64 %i, i32 1, !dbg !38
+  store i64 %1, i64* %y7, align 8, !dbg !39, !tbaa !34
+  ret void, !dbg !40
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 7.0.0 (trunk 330946) (llvm/trunk 330976)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "slp-reduced.c", directory: "/usr2/gberry/local/loop-align")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 7.0.0 (trunk 330946) (llvm/trunk 330976)"}
+!7 = distinct !DISubprogram(name: "patatino", scope: !1, file: !1, line: 6, type: !8, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10, !10, !11}
+!10 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+!12 = !DIDerivedType(tag: DW_TAG_typedef, name: "S", file: !1, line: 4, baseType: !13)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, file: !1, line: 1, size: 128, elements: !14)
+!14 = !{!15, !16}
+!15 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !13, file: !1, line: 2, baseType: !10, size: 64)
+!16 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !13, file: !1, line: 3, baseType: !10, size: 64, offset: 64)
+!17 = !{!18, !19, !20, !21, !22}
+!18 = !DILocalVariable(name: "n", arg: 1, scope: !7, file: !1, line: 6, type: !10)
+!19 = !DILocalVariable(name: "i", arg: 2, scope: !7, file: !1, line: 6, type: !10)
+!20 = !DILocalVariable(name: "p", arg: 3, scope: !7, file: !1, line: 6, type: !11)
+!21 = !DILocalVariable(name: "x", scope: !7, file: !1, line: 7, type: !10)
+!22 = !DILocalVariable(name: "y", scope: !7, file: !1, line: 8, type: !10)
+!23 = !DILocation(line: 6, column: 15, scope: !7)
+!24 = !DILocation(line: 6, column: 23, scope: !7)
+!25 = !DILocation(line: 6, column: 29, scope: !7)
+!26 = !DILocation(line: 7, column: 19, scope: !7)
+!27 = !{!28, !29, i64 0}
+!28 = !{!"", !29, i64 0, !29, i64 8}
+!29 = !{!"long", !30, i64 0}
+!30 = !{!"omnipotent char", !31, i64 0}
+!31 = !{!"Simple C/C++ TBAA"}
+!32 = !DILocation(line: 7, column: 10, scope: !7)
+!33 = !DILocation(line: 8, column: 19, scope: !7)
+!34 = !{!28, !29, i64 8}
+!35 = !DILocation(line: 8, column: 10, scope: !7)
+!36 = !DILocation(line: 9, column: 10, scope: !7)
+!37 = !DILocation(line: 9, column: 12, scope: !7)
+!38 = !DILocation(line: 10, column: 10, scope: !7)
+!39 = !DILocation(line: 10, column: 12, scope: !7)
+!40 = !DILocation(line: 11, column: 1, scope: !7)

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/transpose.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/transpose.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/transpose.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/transpose.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,310 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; CHECK-LABEL: @build_vec_v2i64(
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> [[V1]], i32 1
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i32 1
+; CHECK-NEXT:    ret <2 x i64> [[TMP3_1]]
+;
+  %v0.0 = extractelement <2 x i64> %v0, i32 0
+  %v0.1 = extractelement <2 x i64> %v0, i32 1
+  %v1.0 = extractelement <2 x i64> %v1, i32 0
+  %v1.1 = extractelement <2 x i64> %v1, i32 1
+  %tmp0.0 = add i64 %v0.0, %v1.0
+  %tmp0.1 = add i64 %v0.1, %v1.1
+  %tmp1.0 = sub i64 %v0.0, %v1.0
+  %tmp1.1 = sub i64 %v0.1, %v1.1
+  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
+  %tmp3.0 = insertelement <2 x i64> undef, i64 %tmp2.0, i32 0
+  %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
+  ret <2 x i64> %tmp3.1
+}
+
+define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
+; CHECK-LABEL: @store_chain_v2i64(
+; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
+; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
+; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
+; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
+; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a.0 = getelementptr i64, i64* %a, i64 0
+  %a.1 = getelementptr i64, i64* %a, i64 1
+  %b.0 = getelementptr i64, i64* %b, i64 0
+  %b.1 = getelementptr i64, i64* %b, i64 1
+  %c.0 = getelementptr i64, i64* %c, i64 0
+  %c.1 = getelementptr i64, i64* %c, i64 1
+  %v0.0 = load i64, i64* %a.0, align 8
+  %v0.1 = load i64, i64* %a.1, align 8
+  %v1.0 = load i64, i64* %b.0, align 8
+  %v1.1 = load i64, i64* %b.1, align 8
+  %tmp0.0 = add i64 %v0.0, %v1.0
+  %tmp0.1 = add i64 %v0.1, %v1.1
+  %tmp1.0 = sub i64 %v0.0, %v1.0
+  %tmp1.1 = sub i64 %v0.1, %v1.1
+  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
+  store i64 %tmp2.0, i64* %c.0, align 8
+  store i64 %tmp2.1, i64* %c.1, align 8
+  ret void
+}
+
+define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP11]]
+;
+  %v0.0 = extractelement <4 x i32> %v0, i32 0
+  %v0.1 = extractelement <4 x i32> %v0, i32 1
+  %v0.2 = extractelement <4 x i32> %v0, i32 2
+  %v0.3 = extractelement <4 x i32> %v0, i32 3
+  %v1.0 = extractelement <4 x i32> %v1, i32 0
+  %v1.1 = extractelement <4 x i32> %v1, i32 1
+  %v1.2 = extractelement <4 x i32> %v1, i32 2
+  %v1.3 = extractelement <4 x i32> %v1, i32 3
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp0.2 = add i32 %v0.2, %v1.2
+  %tmp0.3 = add i32 %v0.3, %v1.3
+  %tmp1.0 = sub i32 %v0.0, %v1.0
+  %tmp1.1 = sub i32 %v0.1, %v1.1
+  %tmp1.2 = sub i32 %v0.2, %v1.2
+  %tmp1.3 = sub i32 %v0.3, %v1.3
+  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
+  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
+  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
+  ret <4 x i32> %tmp3.3
+}
+
+define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_reuse_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+;
+  %v0.0 = extractelement <2 x i32> %v0, i32 0
+  %v0.1 = extractelement <2 x i32> %v0, i32 1
+  %v1.0 = extractelement <2 x i32> %v1, i32 0
+  %v1.1 = extractelement <2 x i32> %v1, i32 1
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp1.0 = sub i32 %v0.0, %v1.0
+  %tmp1.1 = sub i32 %v0.1, %v1.1
+  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
+  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
+  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
+  ret <4 x i32> %tmp3.3
+}
+
+define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_reuse_1(
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
+; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
+; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
+;
+  %v0.0 = extractelement <2 x i32> %v0, i32 0
+  %v0.1 = extractelement <2 x i32> %v0, i32 1
+  %v1.0 = extractelement <2 x i32> %v1, i32 0
+  %v1.1 = extractelement <2 x i32> %v1, i32 1
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp0.2 = xor i32 %v0.0, %v1.0
+  %tmp0.3 = xor i32 %v0.1, %v1.1
+  %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
+  %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
+  %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
+  %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
+  %tmp2.0 = insertelement <4 x i32> undef, i32 %tmp1.0, i32 0
+  %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
+  %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
+  %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
+  ret <4 x i32> %tmp2.3
+}
+
+define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_3_binops(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add <2 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+;
+  %v0.0 = extractelement <2 x i32> %v0, i32 0
+  %v0.1 = extractelement <2 x i32> %v0, i32 1
+  %v1.0 = extractelement <2 x i32> %v1, i32 0
+  %v1.1 = extractelement <2 x i32> %v1, i32 1
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp0.2 = xor i32 %v0.0, %v1.0
+  %tmp0.3 = xor i32 %v0.1, %v1.1
+  %tmp1.0 = mul i32 %v0.0, %v1.0
+  %tmp1.1 = mul i32 %v0.1, %v1.1
+  %tmp1.2 = xor i32 %v0.0, %v1.0
+  %tmp1.3 = xor i32 %v0.1, %v1.1
+  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
+  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
+  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
+  ret <4 x i32> %tmp3.3
+}
+
+define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @reduction_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr <4 x i32> [[TMP11]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP13:%.*]] = and <4 x i32> [[TMP12]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw <4 x i32> [[TMP13]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP11]]
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <4 x i32> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP16]])
+; CHECK-NEXT:    ret i32 [[TMP17]]
+;
+  %v0.0 = extractelement <4 x i32> %v0, i32 0
+  %v0.1 = extractelement <4 x i32> %v0, i32 1
+  %v0.2 = extractelement <4 x i32> %v0, i32 2
+  %v0.3 = extractelement <4 x i32> %v0, i32 3
+  %v1.0 = extractelement <4 x i32> %v1, i32 0
+  %v1.1 = extractelement <4 x i32> %v1, i32 1
+  %v1.2 = extractelement <4 x i32> %v1, i32 2
+  %v1.3 = extractelement <4 x i32> %v1, i32 3
+  %tmp0.0 = add i32 %v0.0, %v1.0
+  %tmp0.1 = add i32 %v0.1, %v1.1
+  %tmp0.2 = add i32 %v0.2, %v1.2
+  %tmp0.3 = add i32 %v0.3, %v1.3
+  %tmp1.0 = sub i32 %v0.0, %v1.0
+  %tmp1.1 = sub i32 %v0.1, %v1.1
+  %tmp1.2 = sub i32 %v0.2, %v1.2
+  %tmp1.3 = sub i32 %v0.3, %v1.3
+  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+  %tmp3.0 = lshr i32 %tmp2.0, 15
+  %tmp3.1 = lshr i32 %tmp2.1, 15
+  %tmp3.2 = lshr i32 %tmp2.2, 15
+  %tmp3.3 = lshr i32 %tmp2.3, 15
+  %tmp4.0 = and i32 %tmp3.0, 65537
+  %tmp4.1 = and i32 %tmp3.1, 65537
+  %tmp4.2 = and i32 %tmp3.2, 65537
+  %tmp4.3 = and i32 %tmp3.3, 65537
+  %tmp5.0 = mul nuw i32 %tmp4.0, 65535
+  %tmp5.1 = mul nuw i32 %tmp4.1, 65535
+  %tmp5.2 = mul nuw i32 %tmp4.2, 65535
+  %tmp5.3 = mul nuw i32 %tmp4.3, 65535
+  %tmp6.0 = add i32 %tmp5.0, %tmp2.0
+  %tmp6.1 = add i32 %tmp5.1, %tmp2.1
+  %tmp6.2 = add i32 %tmp5.2, %tmp2.2
+  %tmp6.3 = add i32 %tmp5.3, %tmp2.3
+  %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
+  %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
+  %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
+  %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
+  %reduce.0 = add i32 %tmp7.1, %tmp7.0
+  %reduce.1 = add i32 %reduce.0, %tmp7.2
+  %reduce.2 = add i32 %reduce.1, %tmp7.3
+  ret i32 %reduce.2
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; This test is reduced from the TSVC evaluation of vectorizers:
+; https://github.com/llvm/llvm-test-suite/commits/master/MultiSource/Benchmarks/TSVC/LoopRerolling-flt/tsc.c
+; Two loads and an fmul are expected to be vectorized to <2 x float>.
+; Otherwise, performance will suffer on Cortex-A53.
+; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details.
+
+%struct.GlobalData = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float], [5 x i32], [12 x i8], [32000 x float], [7 x i32], [4 x i8], [32000 x float], [11 x i32], [4 x i8], [32000 x float], [13 x i32], [12 x i8], [256 x [256 x float]], [17 x i32], [12 x i8], [256 x [256 x float]], [19 x i32], [4 x i8], [256 x [256 x float]], [23 x i32], [4 x i8], [256 x [256 x float]] }
+
+ at global_data = common dso_local global %struct.GlobalData zeroinitializer, align 16
+
+define i32 @s352() {
+; CHECK-LABEL: @s352(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[PREHEADER:%.*]]
+; CHECK:       preheader:
+; CHECK-NEXT:    [[NL_017:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret i32 0
+; CHECK:       for.cond.cleanup3:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[NL_017]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1600000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[PREHEADER]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_115:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD39:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[MUL7:%.*]] = fmul float [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[DOT_115]], [[MUL7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX10]], align 4
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[MUL14:%.*]] = fmul float [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[ADD15:%.*]] = fadd float [[ADD]], [[MUL14]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP5]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[ARRAYIDX18]] to <2 x float>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[ARRAYIDX21]] to <2 x float>*
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; CHECK-NEXT:    [[ADD23:%.*]] = fadd float [[ADD15]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD23]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX34]], align 4
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[MUL38:%.*]] = fmul float [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[ADD39]] = fadd float [[ADD31]], [[MUL38]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 32000
+; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP3]]
+;
+entry:
+  br label %preheader
+
+preheader:
+  %nl.017 = phi i32 [ 0, %entry ], [ %inc, %for.cond.cleanup3 ]
+  br label %for.body
+
+for.cond.cleanup:
+  ret i32 0
+
+for.cond.cleanup3:
+  %inc = add nuw nsw i32 %nl.017, 1
+  %exitcond = icmp eq i32 %inc, 1600000
+  br i1 %exitcond, label %for.cond.cleanup, label %preheader
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for.body ]
+  %dot.115 = phi float [ 0.000000e+00, %preheader ], [ %add39, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx6 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %indvars.iv
+  %1 = load float, float* %arrayidx6, align 4
+  %mul7 = fmul float %0, %1
+  %add = fadd float %dot.115, %mul7
+  %2 = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx10 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %2
+  %3 = load float, float* %arrayidx10, align 4
+  %arrayidx13 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %2
+  %4 = load float, float* %arrayidx13, align 4
+  %mul14 = fmul float %3, %4
+  %add15 = fadd float %add, %mul14
+  %5 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx18 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %5
+  %6 = load float, float* %arrayidx18, align 4
+  %arrayidx21 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %5
+  %7 = load float, float* %arrayidx21, align 4
+  %mul22 = fmul float %6, %7
+  %add23 = fadd float %add15, %mul22
+  %8 = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx26 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %8
+  %9 = load float, float* %arrayidx26, align 4
+  %arrayidx29 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %8
+  %10 = load float, float* %arrayidx29, align 4
+  %mul30 = fmul float %9, %10
+  %add31 = fadd float %add23, %mul30
+  %11 = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx34 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %11
+  %12 = load float, float* %arrayidx34, align 4
+  %arrayidx37 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %11
+  %13 = load float, float* %arrayidx37, align 4
+  %mul38 = fmul float %12, %13
+  %add39 = fadd float %add31, %mul38
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+  %cmp2 = icmp ult i64 %indvars.iv.next, 32000
+  br i1 %cmp2, label %for.body, label %for.cond.cleanup3
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -slp-threshold=-18 < %s | FileCheck %s
+
+; Make sure there's no SCEV assert when the indexes are for different
+; sized address spaces
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+define void @slp_scev_assert(i32 %idx, i64 %tmp3) #0 {
+; CHECK-LABEL: @slp_scev_assert(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = addrspacecast i8 addrspace(5)* undef to i8*
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP]], i64 [[TMP3:%.*]]
+; CHECK-NEXT:    store i8 0, i8 addrspace(5)* [[TMP2]]
+; CHECK-NEXT:    store i8 0, i8* [[TMP4]]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp = addrspacecast i8 addrspace(5)* undef to i8*
+  %tmp2 = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 %idx
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 %tmp3
+  store i8 0, i8 addrspace(5)* %tmp2
+  store i8 0, i8* %tmp4
+  ret void
+}
+
+define void @multi_as_reduction_different_sized(i32 addrspace(3)* %lds, i32 %idx0, i64 %idx1) #0 {
+; CHECK-LABEL: @multi_as_reduction_different_sized(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[FLAT:%.*]] = addrspacecast i32 addrspace(3)* [[LDS:%.*]] to i32*
+; CHECK-NEXT:    [[ADD0:%.*]] = add i32 [[IDX0:%.*]], 2
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
+; CHECK-NEXT:    [[LDS_1:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* [[LDS]], i32 [[ADD0]]
+; CHECK-NEXT:    [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
+; CHECK-NEXT:    [[LOAD_LDS_0:%.*]] = load i32, i32 addrspace(3)* [[LDS]], align 4
+; CHECK-NEXT:    [[LOAD_LDS_1:%.*]] = load i32, i32 addrspace(3)* [[LDS_1]], align 4
+; CHECK-NEXT:    [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
+; CHECK-NEXT:    [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
+; CHECK-NEXT:    [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_LDS_0]]
+; CHECK-NEXT:    [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_LDS_1]]
+; CHECK-NEXT:    store i32 [[SUB0]], i32* undef
+; CHECK-NEXT:    store i32 [[SUB1]], i32* undef
+; CHECK-NEXT:    ret void
+;
+bb:
+  %flat = addrspacecast i32 addrspace(3)* %lds to i32*
+  %add0 = add i32 %idx0, 2
+  %add1 = add i64 %idx1, 1
+
+  %lds.1 = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 %add0
+  %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+  %load.lds.0 = load i32, i32 addrspace(3)* %lds, align 4
+  %load.lds.1 = load i32, i32 addrspace(3)* %lds.1, align 4
+
+  %load.flat.0 = load i32, i32* %flat, align 4
+  %load.flat.1 = load i32, i32* %flat.1, align 4
+
+  %sub0 = sub i32 %load.flat.0, %load.lds.0
+  %sub1 = sub i32 %load.flat.1, %load.lds.1
+
+  store i32 %sub0, i32* undef
+  store i32 %sub1, i32* undef
+  ret void
+}
+
+; This should vectorize if using GetUnderlyingObject
+define void @multi_as_reduction_same_size(i32 addrspace(1)* %global, i64 %idx0, i64 %idx1) #0 {
+; CHECK-LABEL: @multi_as_reduction_same_size(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[FLAT:%.*]] = addrspacecast i32 addrspace(1)* [[GLOBAL:%.*]] to i32*
+; CHECK-NEXT:    [[ADD0:%.*]] = add i64 [[IDX0:%.*]], 2
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
+; CHECK-NEXT:    [[GLOBAL_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[GLOBAL]], i64 [[ADD0]]
+; CHECK-NEXT:    [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
+; CHECK-NEXT:    [[LOAD_GLOBAL_0:%.*]] = load i32, i32 addrspace(1)* [[GLOBAL]], align 4
+; CHECK-NEXT:    [[LOAD_GLOBAL_1:%.*]] = load i32, i32 addrspace(1)* [[GLOBAL_1]], align 4
+; CHECK-NEXT:    [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
+; CHECK-NEXT:    [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
+; CHECK-NEXT:    [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_GLOBAL_0]]
+; CHECK-NEXT:    [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_GLOBAL_1]]
+; CHECK-NEXT:    store i32 [[SUB0]], i32* undef
+; CHECK-NEXT:    store i32 [[SUB1]], i32* undef
+; CHECK-NEXT:    ret void
+;
+bb:
+  %flat = addrspacecast i32 addrspace(1)* %global to i32*
+  %add0 = add i64 %idx0, 2
+  %add1 = add i64 %idx1, 1
+
+  %global.1 = getelementptr inbounds i32, i32 addrspace(1)* %global, i64 %add0
+  %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+  %load.global.0 = load i32, i32 addrspace(1)* %global, align 4
+  %load.global.1 = load i32, i32 addrspace(1)* %global.1, align 4
+
+  %load.flat.0 = load i32, i32* %flat, align 4
+  %load.flat.1 = load i32, i32* %flat.1, align 4
+
+  %sub0 = sub i32 %load.flat.0, %load.global.0
+  %sub1 = sub i32 %load.flat.1, %load.global.1
+
+  store i32 %sub0, i32* undef
+  store i32 %sub1, i32* undef
+  ret void
+}
+
+; This should vectorize if using GetUnderlyingObject
+; The add is done in the same width, even though the address space size is smaller
+define void @multi_as_reduction_different_sized_noncanon(i32 addrspace(3)* %lds, i64 %idx0, i64 %idx1) #0 {
+; CHECK-LABEL: @multi_as_reduction_different_sized_noncanon(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[FLAT:%.*]] = addrspacecast i32 addrspace(3)* [[LDS:%.*]] to i32*
+; CHECK-NEXT:    [[ADD0:%.*]] = add i64 [[IDX0:%.*]], 2
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
+; CHECK-NEXT:    [[LDS_1:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* [[LDS]], i64 [[ADD0]]
+; CHECK-NEXT:    [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
+; CHECK-NEXT:    [[LOAD_LDS_0:%.*]] = load i32, i32 addrspace(3)* [[LDS]], align 4
+; CHECK-NEXT:    [[LOAD_LDS_1:%.*]] = load i32, i32 addrspace(3)* [[LDS_1]], align 4
+; CHECK-NEXT:    [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
+; CHECK-NEXT:    [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
+; CHECK-NEXT:    [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_LDS_0]]
+; CHECK-NEXT:    [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_LDS_1]]
+; CHECK-NEXT:    store i32 [[SUB0]], i32* undef
+; CHECK-NEXT:    store i32 [[SUB1]], i32* undef
+; CHECK-NEXT:    ret void
+;
+bb:
+  %flat = addrspacecast i32 addrspace(3)* %lds to i32*
+  %add0 = add i64 %idx0, 2
+  %add1 = add i64 %idx1, 1
+
+  %lds.1 = getelementptr inbounds i32, i32 addrspace(3)* %lds, i64 %add0
+  %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+  %load.lds.0 = load i32, i32 addrspace(3)* %lds, align 4
+  %load.lds.1 = load i32, i32 addrspace(3)* %lds.1, align 4
+
+  %load.flat.0 = load i32, i32* %flat, align 4
+  %load.flat.1 = load i32, i32* %flat.1, align 4
+
+  %sub0 = sub i32 %load.flat.0, %load.lds.0
+  %sub1 = sub i32 %load.flat.1, %load.lds.1
+
+  store i32 %sub0, i32* undef
+  store i32 %sub1, i32* undef
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,250 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -S -slp-threshold=-100 -slp-vectorize-hor-store -dce | FileCheck %s --check-prefix=GFX9
+
+ at arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
+ at arr64 = local_unnamed_addr global [32 x i64] zeroinitializer, align 16
+ at var = global i32 zeroinitializer, align 8
+ at var64 = global i64 zeroinitializer, align 8
+
+ at farr = local_unnamed_addr global [32 x float] zeroinitializer, align 16
+ at fvar = global float zeroinitializer, align 8
+
+ at darr = local_unnamed_addr global [32 x double] zeroinitializer, align 16
+ at dvar = global double zeroinitializer, align 8
+
+; Tests whether the min/max reduction pattern is vectorized if SLP starts at the store.
+define i32 @smaxv6() {
+; GFX9-LABEL: @smaxv6(
+; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
+; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; GFX9-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
+; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP6:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i32 [[TMP5]], i32 [[SELECT1]]
+; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
+; GFX9-NEXT:    store i32 [[STORE_SELECT]], i32* @var, align 8
+; GFX9-NEXT:    ret i32 [[OP_EXTRA]]
+;
+  %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+  %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+  %cmp1 = icmp sgt i32 %load1, %load2
+  %select1 = select i1 %cmp1, i32 %load1, i32 %load2
+
+  %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+  %cmp2 = icmp sgt i32 %select1, %load3
+  %select2 = select i1 %cmp2, i32 %select1, i32 %load3
+
+  %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+  %cmp3 = icmp sgt i32 %select2, %load4
+  %select3 = select i1 %cmp3, i32 %select2, i32 %load4
+
+  %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+  %cmp4 = icmp sgt i32 %select3, %load5
+  %select4 = select i1 %cmp4, i32 %select3, i32 %load5
+
+  %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+  %cmp5 = icmp sgt i32 %select4, %load6
+  %select5 = select i1 %cmp5, i32 %select4, i32 %load6
+
+  %store-select = select i1 %cmp1, i32 3, i32 4
+  store i32 %store-select, i32* @var, align 8
+  ret i32 %select5
+}
+
+define i64 @sminv6() {
+; GFX9-LABEL: @sminv6(
+; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([32 x i64]* @arr64 to <2 x i64>*), align 16
+; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; GFX9-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
+; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
+; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP6:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 [[SELECT1]]
+; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
+; GFX9-NEXT:    store i64 [[STORE_SELECT]], i64* @var64, align 8
+; GFX9-NEXT:    ret i64 [[OP_EXTRA]]
+;
+  %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
+  %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
+  %cmp1 = icmp slt i64 %load1, %load2
+  %select1 = select i1 %cmp1, i64 %load1, i64 %load2
+
+  %load3 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2), align 16
+  %cmp2 = icmp slt i64 %select1, %load3
+  %select2 = select i1 %cmp2, i64 %select1, i64 %load3
+
+  %load4 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 3), align 8
+  %cmp3 = icmp slt i64 %select2, %load4
+  %select3 = select i1 %cmp3, i64 %select2, i64 %load4
+
+  %load5 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16
+  %cmp4 = icmp slt i64 %select3, %load5
+  %select4 = select i1 %cmp4, i64 %select3, i64 %load5
+
+  %load6 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8
+  %cmp5 = icmp slt i64 %select4, %load6
+  %select5 = select i1 %cmp5, i64 %select4, i64 %load6
+
+  %store-select = select i1 %cmp1, i64 3, i64 4
+  store i64 %store-select, i64* @var64, align 8
+  ret i64 %select5
+}
+
+define float @fmaxv6() {
+; GFX9-LABEL: @fmaxv6(
+; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16
+; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
+; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]]
+; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
+; GFX9-NEXT:    store float [[STORE_SELECT]], float* @fvar, align 8
+; GFX9-NEXT:    ret float [[OP_EXTRA]]
+;
+  %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
+  %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
+  %cmp1 = fcmp fast ogt float %load1, %load2
+  %select1 = select i1 %cmp1, float %load1, float %load2
+
+  %load3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
+  %cmp2 = fcmp fast ogt float %select1, %load3
+  %select2 = select i1 %cmp2, float %select1, float %load3
+
+  %load4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
+  %cmp3 = fcmp fast ogt float %select2, %load4
+  %select3 = select i1 %cmp3, float %select2, float %load4
+
+  %load5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
+  %cmp4 = fcmp fast ogt float %select3, %load5
+  %select4 = select i1 %cmp4, float %select3, float %load5
+
+  %load6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
+  %cmp5 = fcmp fast ogt float %select4, %load6
+  %select5 = select i1 %cmp5, float %select4, float %load6
+
+  %store-select = select i1 %cmp1, float 3.0, float 4.0
+  store float %store-select, float* @fvar, align 8
+  ret float %select5
+}
+
+define double @dminv6() {
+; GFX9-LABEL: @dminv6(
+; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16
+; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
+; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
+; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]]
+; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
+; GFX9-NEXT:    store double [[STORE_SELECT]], double* @dvar, align 8
+; GFX9-NEXT:    ret double [[OP_EXTRA]]
+;
+  %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
+  %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
+  %cmp1 = fcmp fast olt double %load1, %load2
+  %select1 = select i1 %cmp1, double %load1, double %load2
+
+  %load3 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
+  %cmp2 = fcmp fast olt double %select1, %load3
+  %select2 = select i1 %cmp2, double %select1, double %load3
+
+  %load4 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
+  %cmp3 = fcmp fast olt double %select2, %load4
+  %select3 = select i1 %cmp3, double %select2, double %load4
+
+  %load5 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
+  %cmp4 = fcmp fast olt double %select3, %load5
+  %select4 = select i1 %cmp4, double %select3, double %load5
+
+  %load6 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
+  %cmp5 = fcmp fast olt double %select4, %load6
+  %select5 = select i1 %cmp5, double %select4, double %load6
+
+  %store-select = select i1 %cmp1, double 3.0, double 4.0
+  store double %store-select, double* @dvar, align 8
+  ret double %select5
+}
+
+define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
+; GFX9-LABEL: @smax_wdiff_valuenum(
+; GFX9-NEXT:    [[VLOAD:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
+; GFX9-NEXT:    [[ELT1:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
+; GFX9-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]]
+; GFX9-NEXT:    [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
+; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
+; GFX9-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[SELECT1]]
+; GFX9-NEXT:    [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
+; GFX9-NEXT:    store i32 [[STOREVAL]], i32* @var, align 8
+; GFX9-NEXT:    ret i32 [[OP_EXTRA]]
+;
+  %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
+  %elt1 = extractelement <2 x i32> %vload, i32 0
+  %cmp1 = icmp sgt i32 %elt1, %v1
+  %ex0 = extractelement <2 x i32> %vload, i32 0
+  %select1 = select i1 %cmp1, i32 %ex0, i32 %v1
+
+  %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+  %cmp2 = icmp sgt i32 %select1, %load3
+  %select2 = select i1 %cmp2, i32 %select1, i32 %load3
+
+  %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+  %cmp3 = icmp sgt i32 %select2, %load4
+  %select3 = select i1 %cmp3, i32 %select2, i32 %load4
+
+  %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+  %cmp4 = icmp sgt i32 %select3, %load5
+  %select4 = select i1 %cmp4, i32 %select3, i32 %load5
+
+  %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+  %cmp5 = icmp sgt i32 %select4, %load6
+  %select5 = select i1 %cmp5, i32 %select4, i32 %load6
+
+  %storeval = select i1 %cmp1, i32 3, i32 4
+  store i32 %storeval, i32* @var, align 8
+  ret i32 %select5
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,203 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s
+
+; FIXME: Should still like to vectorize the memory operations for VI
+
+; Simple 3-pair chain with loads and stores
+; GCN-LABEL: @test1_as_3_3_3_v2f16(
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX89: ret
+define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
+  %i0 = load half, half addrspace(3)* %a, align 2
+  %i1 = load half, half addrspace(3)* %b, align 2
+  %mul = fmul half %i0, %i1
+  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+  %mul5 = fmul half %i3, %i4
+  store half %mul, half addrspace(3)* %c, align 2
+  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+  store half %mul5, half addrspace(3)* %arrayidx5, align 2
+  ret void
+}
+
+; GCN-LABEL: @test1_as_3_0_0(
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
+; GFX89: ret
+define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
+  %i0 = load half, half addrspace(3)* %a, align 2
+  %i1 = load half, half* %b, align 2
+  %mul = fmul half %i0, %i1
+  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
+  %i4 = load half, half* %arrayidx4, align 2
+  %mul5 = fmul half %i3, %i4
+  store half %mul, half* %c, align 2
+  %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
+  store half %mul5, half* %arrayidx5, align 2
+  ret void
+}
+
+; GCN-LABEL: @test1_as_0_0_3_v2f16(
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX89: ret
+define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
+  %i0 = load half, half* %a, align 2
+  %i1 = load half, half* %b, align 2
+  %mul = fmul half %i0, %i1
+  %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
+  %i3 = load half, half* %arrayidx3, align 2
+  %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
+  %i4 = load half, half* %arrayidx4, align 2
+  %mul5 = fmul half %i3, %i4
+  store half %mul, half addrspace(3)* %c, align 2
+  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+  store half %mul5, half addrspace(3)* %arrayidx5, align 2
+  ret void
+}
+
+; GCN-LABEL: @test1_fma_v2f16(
+; GFX9: load <2 x half>
+; GFX9: load <2 x half>
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+  %i0 = load half, half addrspace(3)* %a, align 2
+  %i1 = load half, half addrspace(3)* %b, align 2
+  %i2 = load half, half addrspace(3)* %c, align 2
+  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
+  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
+  store half %fma0, half addrspace(3)* %d, align 2
+  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+  store half %fma1, half addrspace(3)* %arrayidx6, align 2
+  ret void
+}
+
+; GCN-LABEL: @mul_scalar_v2f16(
+; GFX9: load <2 x half>
+; GFX9: fmul <2 x half>
+; GFX9: store <2 x half>
+define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
+  %i0 = load half, half addrspace(3)* %a, align 2
+  %mul = fmul half %i0, %scalar
+  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+  %mul5 = fmul half %i3, %scalar
+  store half %mul, half addrspace(3)* %c, align 2
+  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+  store half %mul5, half addrspace(3)* %arrayidx5, align 2
+  ret void
+}
+
+; GCN-LABEL: @fabs_v2f16
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fabs.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
+  %i0 = load half, half addrspace(3)* %a, align 2
+  %fabs0 = call half @llvm.fabs.f16(half %i0)
+  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+  %fabs1 = call half @llvm.fabs.f16(half %i3)
+  store half %fabs0, half addrspace(3)* %c, align 2
+  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+  store half %fabs1, half addrspace(3)* %arrayidx5, align 2
+  ret void
+}
+
+; GCN-LABEL: @test1_fabs_fma_v2f16(
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fabs.v2f16(
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+  %i0 = load half, half addrspace(3)* %a, align 2
+  %i1 = load half, half addrspace(3)* %b, align 2
+  %i2 = load half, half addrspace(3)* %c, align 2
+  %i0.fabs = call half @llvm.fabs.f16(half %i0)
+
+  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
+  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+  %i3.fabs = call half @llvm.fabs.f16(half %i3)
+
+  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
+  store half %fma0, half addrspace(3)* %d, align 2
+  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+  store half %fma1, half addrspace(3)* %arrayidx6, align 2
+  ret void
+}
+
+; FIXME: Should do vector load and extract component for fabs
+; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
+; GFX9: load half
+; GFX9: call half @llvm.fabs.f16(
+; GFX9: load <2 x half>
+; GFX9: load half
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+  %i0 = load half, half addrspace(3)* %a, align 2
+  %i1 = load half, half addrspace(3)* %b, align 2
+  %i2 = load half, half addrspace(3)* %c, align 2
+  %i1.fabs = call half @llvm.fabs.f16(half %i1)
+
+  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
+  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+  %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+  %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+  %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
+  store half %fma0, half addrspace(3)* %d, align 2
+  %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+  store half %fma1, half addrspace(3)* %arrayidx6, align 2
+  ret void
+}
+
+; GCN-LABEL: @canonicalize_v2f16
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
+  %i0 = load half, half addrspace(3)* %a, align 2
+  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
+  %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+  %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
+  store half %canonicalize0, half addrspace(3)* %c, align 2
+  %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+  store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.fma.f16(half, half, half) #1
+declare half @llvm.canonicalize.f16(half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

Added: llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
+
+define half @reduction_half4(<4 x half> %a) {
+; GFX9-LABEL: @reduction_half4(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x half> [[A]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[BIN_RDX]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x half> [[BIN_RDX2]], i32 0
+; GFX9-NEXT:    ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half4(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
+; VI-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT:    ret half [[ADD3]]
+;
+; Linear fadd-fast chain over all four lanes of the input vector.  Per the
+; autogenerated checks above, GFX9 rewrites it as a log2(N) shufflevector
+; reduction while VI keeps the scalar extract/fadd chain.
+entry:
+  %elt0 = extractelement <4 x half> %a, i64 0
+  %elt1 = extractelement <4 x half> %a, i64 1
+  %elt2 = extractelement <4 x half> %a, i64 2
+  %elt3 = extractelement <4 x half> %a, i64 3
+
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+
+  ret half %add3
+}
+
+define half @reduction_half8(<8 x half> %vec8) {
+; GFX9-LABEL: @reduction_half8(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x half> [[VEC8:%.*]], <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x half> [[VEC8]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x half> [[BIN_RDX]], <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x half> [[BIN_RDX2]], <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <8 x half> [[BIN_RDX4]], i32 0
+; GFX9-NEXT:    ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half8(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <8 x half> [[VEC8:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <8 x half> [[VEC8]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <8 x half> [[VEC8]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <8 x half> [[VEC8]], i64 3
+; VI-NEXT:    [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
+; VI-NEXT:    [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
+; VI-NEXT:    [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
+; VI-NEXT:    [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
+; VI-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT:    [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
+; VI-NEXT:    [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
+; VI-NEXT:    [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
+; VI-NEXT:    [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
+; VI-NEXT:    ret half [[ADD7]]
+;
+; Same pattern as @reduction_half4, widened to 8 lanes: GFX9 checks expect a
+; three-level shuffle reduction, VI checks expect the chain to stay scalar.
+entry:
+  %elt0 = extractelement <8 x half> %vec8, i64 0
+  %elt1 = extractelement <8 x half> %vec8, i64 1
+  %elt2 = extractelement <8 x half> %vec8, i64 2
+  %elt3 = extractelement <8 x half> %vec8, i64 3
+  %elt4 = extractelement <8 x half> %vec8, i64 4
+  %elt5 = extractelement <8 x half> %vec8, i64 5
+  %elt6 = extractelement <8 x half> %vec8, i64 6
+  %elt7 = extractelement <8 x half> %vec8, i64 7
+
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+  %add4 = fadd fast half %elt4, %add3
+  %add5 = fadd fast half %elt5, %add4
+  %add6 = fadd fast half %elt6, %add5
+  %add7 = fadd fast half %elt7, %add6
+
+  ret half %add7
+}
+
+define half @reduction_half16(<16 x half> %vec16) {
+; GFX9-LABEL: @reduction_half16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = fadd fast <16 x half> [[VEC16]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x half> [[BIN_RDX]], <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <16 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <16 x half> [[BIN_RDX2]], <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <16 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <16 x half> [[BIN_RDX4]], <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <16 x half> [[BIN_RDX4]], [[RDX_SHUF5]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <16 x half> [[BIN_RDX6]], i32 0
+; GFX9-NEXT:    ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <16 x half> [[VEC16]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <16 x half> [[VEC16]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <16 x half> [[VEC16]], i64 3
+; VI-NEXT:    [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
+; VI-NEXT:    [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
+; VI-NEXT:    [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
+; VI-NEXT:    [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
+; VI-NEXT:    [[ELT8:%.*]] = extractelement <16 x half> [[VEC16]], i64 8
+; VI-NEXT:    [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
+; VI-NEXT:    [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
+; VI-NEXT:    [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
+; VI-NEXT:    [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
+; VI-NEXT:    [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
+; VI-NEXT:    [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
+; VI-NEXT:    [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
+; VI-NEXT:    [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT:    [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
+; VI-NEXT:    [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
+; VI-NEXT:    [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
+; VI-NEXT:    [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
+; VI-NEXT:    [[ADD8:%.*]] = fadd fast half [[ELT8]], [[ADD7]]
+; VI-NEXT:    [[ADD9:%.*]] = fadd fast half [[ELT9]], [[ADD8]]
+; VI-NEXT:    [[ADD10:%.*]] = fadd fast half [[ELT10]], [[ADD9]]
+; VI-NEXT:    [[ADD11:%.*]] = fadd fast half [[ELT11]], [[ADD10]]
+; VI-NEXT:    [[ADD12:%.*]] = fadd fast half [[ELT12]], [[ADD11]]
+; VI-NEXT:    [[ADD13:%.*]] = fadd fast half [[ELT13]], [[ADD12]]
+; VI-NEXT:    [[ADD14:%.*]] = fadd fast half [[ELT14]], [[ADD13]]
+; VI-NEXT:    [[ADD15:%.*]] = fadd fast half [[ELT15]], [[ADD14]]
+; VI-NEXT:    ret half [[ADD15]]
+;
+; 16-lane variant of the fadd-fast reduction chain: GFX9 checks expect a
+; four-level shuffle reduction, VI checks expect 15 scalar fadds.
+entry:
+  %elt0 = extractelement <16 x half> %vec16, i64 0
+  %elt1 = extractelement <16 x half> %vec16, i64 1
+  %elt2 = extractelement <16 x half> %vec16, i64 2
+  %elt3 = extractelement <16 x half> %vec16, i64 3
+  %elt4 = extractelement <16 x half> %vec16, i64 4
+  %elt5 = extractelement <16 x half> %vec16, i64 5
+  %elt6 = extractelement <16 x half> %vec16, i64 6
+  %elt7 = extractelement <16 x half> %vec16, i64 7
+  %elt8 = extractelement <16 x half> %vec16, i64 8
+  %elt9 = extractelement <16 x half> %vec16, i64 9
+  %elt10 = extractelement <16 x half> %vec16, i64 10
+  %elt11 = extractelement <16 x half> %vec16, i64 11
+  %elt12 = extractelement <16 x half> %vec16, i64 12
+  %elt13 = extractelement <16 x half> %vec16, i64 13
+  %elt14 = extractelement <16 x half> %vec16, i64 14
+  %elt15 = extractelement <16 x half> %vec16, i64 15
+
+  %add1 = fadd fast half %elt1, %elt0
+  %add2 = fadd fast half %elt2, %add1
+  %add3 = fadd fast half %elt3, %add2
+  %add4 = fadd fast half %elt4, %add3
+  %add5 = fadd fast half %elt5, %add4
+  %add6 = fadd fast half %elt6, %add5
+  %add7 = fadd fast half %elt7, %add6
+  %add8 = fadd fast half %elt8, %add7
+  %add9 = fadd fast half %elt9, %add8
+  %add10 = fadd fast half %elt10, %add9
+  %add11 = fadd fast half %elt11, %add10
+  %add12 = fadd fast half %elt12, %add11
+  %add13 = fadd fast half %elt13, %add12
+  %add14 = fadd fast half %elt14, %add13
+  %add15 = fadd fast half %elt15, %add14
+
+  ret half %add15
+}
+
+; FIXME: support vectorization.
+define half @reduction_sub_half4(<4 x half> %a) {
+; GCN-LABEL: @reduction_sub_half4(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
+; GCN-NEXT:    [[ADD1:%.*]] = fsub fast half [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[ADD2:%.*]] = fsub fast half [[ELT2]], [[ADD1]]
+; GCN-NEXT:    [[ADD3:%.*]] = fsub fast half [[ELT3]], [[ADD2]]
+; GCN-NEXT:    ret half [[ADD3]]
+;
+; fsub chain analogous to @reduction_half4; the shared GCN checks show that
+; neither target currently vectorizes the subtraction-based reduction.
+entry:
+  %elt0 = extractelement <4 x half> %a, i64 0
+  %elt1 = extractelement <4 x half> %a, i64 1
+  %elt2 = extractelement <4 x half> %a, i64 2
+  %elt3 = extractelement <4 x half> %a, i64 3
+
+  %add1 = fsub fast half %elt1, %elt0
+  %add2 = fsub fast half %elt2, %add1
+  %add3 = fsub fast half %elt3, %add2
+
+  ret half %add3
+}
+
+define i16 @reduction_v4i16(<4 x i16> %a) {
+; GFX9-LABEL: @reduction_v4i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[A:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = add <4 x i16> [[A]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[BIN_RDX]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[BIN_RDX2]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_v4i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x i16> [[A]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x i16> [[A]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x i16> [[A]], i64 3
+; VI-NEXT:    [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
+; VI-NEXT:    ret i16 [[ADD3]]
+;
+; Integer counterpart of @reduction_half4: an i16 add chain that GFX9 checks
+; expect to become a shuffle reduction while VI checks keep it scalar.
+entry:
+  %elt0 = extractelement <4 x i16> %a, i64 0
+  %elt1 = extractelement <4 x i16> %a, i64 1
+  %elt2 = extractelement <4 x i16> %a, i64 2
+  %elt3 = extractelement <4 x i16> %a, i64 3
+
+  %add1 = add i16 %elt1, %elt0
+  %add2 = add i16 %elt2, %add1
+  %add3 = add i16 %elt3, %add2
+
+  ret i16 %add3
+}
+
+define i16 @reduction_v8i16(<8 x i16> %vec8) {
+; GFX9-LABEL: @reduction_v8i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX:%.*]] = add <8 x i16> [[VEC8]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX2]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i16> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <8 x i16> [[BIN_RDX4]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_v8i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
+; VI-NEXT:    [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
+; VI-NEXT:    [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
+; VI-NEXT:    [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
+; VI-NEXT:    [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
+; VI-NEXT:    [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
+; VI-NEXT:    [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
+; VI-NEXT:    [[ADD4:%.*]] = add i16 [[ELT4]], [[ADD3]]
+; VI-NEXT:    [[ADD5:%.*]] = add i16 [[ELT5]], [[ADD4]]
+; VI-NEXT:    [[ADD6:%.*]] = add i16 [[ELT6]], [[ADD5]]
+; VI-NEXT:    [[ADD7:%.*]] = add i16 [[ELT7]], [[ADD6]]
+; VI-NEXT:    ret i16 [[ADD7]]
+;
+; 8-lane i16 add-reduction chain; GFX9 checks expect a three-level shuffle
+; reduction, VI checks expect the scalar chain to remain.
+entry:
+  %elt0 = extractelement <8 x i16> %vec8, i64 0
+  %elt1 = extractelement <8 x i16> %vec8, i64 1
+  %elt2 = extractelement <8 x i16> %vec8, i64 2
+  %elt3 = extractelement <8 x i16> %vec8, i64 3
+  %elt4 = extractelement <8 x i16> %vec8, i64 4
+  %elt5 = extractelement <8 x i16> %vec8, i64 5
+  %elt6 = extractelement <8 x i16> %vec8, i64 6
+  %elt7 = extractelement <8 x i16> %vec8, i64 7
+
+  %add1 = add i16 %elt1, %elt0
+  %add2 = add i16 %elt2, %add1
+  %add3 = add i16 %elt3, %add2
+  %add4 = add i16 %elt4, %add3
+  %add5 = add i16 %elt5, %add4
+  %add6 = add i16 %elt6, %add5
+  %add7 = add i16 %elt7, %add6
+
+  ret i16 %add7
+}
+
+define i16 @reduction_umin_v4i16(<4 x i16> %vec4) {
+; GFX9-LABEL: @reduction_umin_v4i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i16> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp ult <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_umin_v4i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
+; VI-NEXT:    [[CMP1:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT:    [[CMP2:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]]
+; VI-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MIN1]]
+; VI-NEXT:    [[CMP3:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]]
+; VI-NEXT:    [[MIN3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MIN2]]
+; VI-NEXT:    ret i16 [[MIN3]]
+;
+; Unsigned-min reduction expressed as an icmp ult + select chain; GFX9 checks
+; expect the vectorized min/max shuffle form, VI checks keep it scalar.
+entry:
+  %elt0 = extractelement <4 x i16> %vec4, i64 0
+  %elt1 = extractelement <4 x i16> %vec4, i64 1
+  %elt2 = extractelement <4 x i16> %vec4, i64 2
+  %elt3 = extractelement <4 x i16> %vec4, i64 3
+
+  %cmp1 = icmp ult i16 %elt1, %elt0
+  %min1 = select i1 %cmp1, i16 %elt1, i16 %elt0
+  %cmp2 = icmp ult i16 %elt2, %min1
+  %min2 = select i1 %cmp2, i16 %elt2, i16 %min1
+  %cmp3 = icmp ult i16 %elt3, %min2
+  %min3 = select i1 %cmp3, i16 %elt3, i16 %min2
+
+  ret i16 %min3
+}
+
+define i16 @reduction_icmp_v8i16(<8 x i16> %vec8) {
+; GFX9-LABEL: @reduction_icmp_v8i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i16> [[VEC8]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i16> [[VEC8]], <8 x i16> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> [[RDX_SHUF4]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <8 x i16> [[RDX_MINMAX_SELECT6]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_icmp_v8i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
+; VI-NEXT:    [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
+; VI-NEXT:    [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
+; VI-NEXT:    [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
+; VI-NEXT:    [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
+; VI-NEXT:    [[CMP0:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[MIN1:%.*]] = select i1 [[CMP0]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT:    [[CMP1:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]]
+; VI-NEXT:    [[MIN2:%.*]] = select i1 [[CMP1]], i16 [[ELT2]], i16 [[MIN1]]
+; VI-NEXT:    [[CMP2:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]]
+; VI-NEXT:    [[MIN3:%.*]] = select i1 [[CMP2]], i16 [[ELT3]], i16 [[MIN2]]
+; VI-NEXT:    [[CMP3:%.*]] = icmp ult i16 [[ELT4]], [[MIN3]]
+; VI-NEXT:    [[MIN4:%.*]] = select i1 [[CMP3]], i16 [[ELT4]], i16 [[MIN3]]
+; VI-NEXT:    [[CMP4:%.*]] = icmp ult i16 [[ELT5]], [[MIN4]]
+; VI-NEXT:    [[MIN5:%.*]] = select i1 [[CMP4]], i16 [[ELT5]], i16 [[MIN4]]
+; VI-NEXT:    [[CMP5:%.*]] = icmp ult i16 [[ELT6]], [[MIN5]]
+; VI-NEXT:    [[MIN6:%.*]] = select i1 [[CMP5]], i16 [[ELT6]], i16 [[MIN5]]
+; VI-NEXT:    [[CMP6:%.*]] = icmp ult i16 [[ELT7]], [[MIN6]]
+; VI-NEXT:    [[MIN7:%.*]] = select i1 [[CMP6]], i16 [[ELT7]], i16 [[MIN6]]
+; VI-NEXT:    ret i16 [[MIN7]]
+;
+; 8-lane unsigned-min reduction (icmp ult + select chain); GFX9 checks expect
+; the three-level min/max shuffle reduction, VI checks stay scalar.
+entry:
+  %elt0 = extractelement <8 x i16> %vec8, i64 0
+  %elt1 = extractelement <8 x i16> %vec8, i64 1
+  %elt2 = extractelement <8 x i16> %vec8, i64 2
+  %elt3 = extractelement <8 x i16> %vec8, i64 3
+  %elt4 = extractelement <8 x i16> %vec8, i64 4
+  %elt5 = extractelement <8 x i16> %vec8, i64 5
+  %elt6 = extractelement <8 x i16> %vec8, i64 6
+  %elt7 = extractelement <8 x i16> %vec8, i64 7
+
+  %cmp0 = icmp ult i16 %elt1, %elt0
+  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
+  %cmp1 = icmp ult i16 %elt2, %min1
+  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
+  %cmp2 = icmp ult i16 %elt3, %min2
+  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
+
+  %cmp3 = icmp ult i16 %elt4, %min3
+  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
+  %cmp4 = icmp ult i16 %elt5, %min4
+  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
+
+  %cmp5 = icmp ult i16 %elt6, %min5
+  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
+  %cmp6 = icmp ult i16 %elt7, %min6
+  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
+
+  ret i16 %min7
+}
+
+define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
+; GFX9-LABEL: @reduction_smin_v16i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i16> [[VEC16:%.*]], <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <16 x i16> [[VEC16]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i16> [[VEC16]], <16 x i16> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP5:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> [[RDX_SHUF4]]
+; GFX9-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP8:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> [[RDX_SHUF7]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <16 x i16> [[RDX_MINMAX_SELECT9]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_smin_v16i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <16 x i16> [[VEC16:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <16 x i16> [[VEC16]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <16 x i16> [[VEC16]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <16 x i16> [[VEC16]], i64 3
+; VI-NEXT:    [[ELT4:%.*]] = extractelement <16 x i16> [[VEC16]], i64 4
+; VI-NEXT:    [[ELT5:%.*]] = extractelement <16 x i16> [[VEC16]], i64 5
+; VI-NEXT:    [[ELT6:%.*]] = extractelement <16 x i16> [[VEC16]], i64 6
+; VI-NEXT:    [[ELT7:%.*]] = extractelement <16 x i16> [[VEC16]], i64 7
+; VI-NEXT:    [[ELT8:%.*]] = extractelement <16 x i16> [[VEC16]], i64 8
+; VI-NEXT:    [[ELT9:%.*]] = extractelement <16 x i16> [[VEC16]], i64 9
+; VI-NEXT:    [[ELT10:%.*]] = extractelement <16 x i16> [[VEC16]], i64 10
+; VI-NEXT:    [[ELT11:%.*]] = extractelement <16 x i16> [[VEC16]], i64 11
+; VI-NEXT:    [[ELT12:%.*]] = extractelement <16 x i16> [[VEC16]], i64 12
+; VI-NEXT:    [[ELT13:%.*]] = extractelement <16 x i16> [[VEC16]], i64 13
+; VI-NEXT:    [[ELT14:%.*]] = extractelement <16 x i16> [[VEC16]], i64 14
+; VI-NEXT:    [[ELT15:%.*]] = extractelement <16 x i16> [[VEC16]], i64 15
+; VI-NEXT:    [[CMP0:%.*]] = icmp slt i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[MIN1:%.*]] = select i1 [[CMP0]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT:    [[CMP1:%.*]] = icmp slt i16 [[ELT2]], [[MIN1]]
+; VI-NEXT:    [[MIN2:%.*]] = select i1 [[CMP1]], i16 [[ELT2]], i16 [[MIN1]]
+; VI-NEXT:    [[CMP2:%.*]] = icmp slt i16 [[ELT3]], [[MIN2]]
+; VI-NEXT:    [[MIN3:%.*]] = select i1 [[CMP2]], i16 [[ELT3]], i16 [[MIN2]]
+; VI-NEXT:    [[CMP3:%.*]] = icmp slt i16 [[ELT4]], [[MIN3]]
+; VI-NEXT:    [[MIN4:%.*]] = select i1 [[CMP3]], i16 [[ELT4]], i16 [[MIN3]]
+; VI-NEXT:    [[CMP4:%.*]] = icmp slt i16 [[ELT5]], [[MIN4]]
+; VI-NEXT:    [[MIN5:%.*]] = select i1 [[CMP4]], i16 [[ELT5]], i16 [[MIN4]]
+; VI-NEXT:    [[CMP5:%.*]] = icmp slt i16 [[ELT6]], [[MIN5]]
+; VI-NEXT:    [[MIN6:%.*]] = select i1 [[CMP5]], i16 [[ELT6]], i16 [[MIN5]]
+; VI-NEXT:    [[CMP6:%.*]] = icmp slt i16 [[ELT7]], [[MIN6]]
+; VI-NEXT:    [[MIN7:%.*]] = select i1 [[CMP6]], i16 [[ELT7]], i16 [[MIN6]]
+; VI-NEXT:    [[CMP7:%.*]] = icmp slt i16 [[ELT8]], [[MIN7]]
+; VI-NEXT:    [[MIN8:%.*]] = select i1 [[CMP7]], i16 [[ELT8]], i16 [[MIN7]]
+; VI-NEXT:    [[CMP8:%.*]] = icmp slt i16 [[ELT9]], [[MIN8]]
+; VI-NEXT:    [[MIN9:%.*]] = select i1 [[CMP8]], i16 [[ELT9]], i16 [[MIN8]]
+; VI-NEXT:    [[CMP9:%.*]] = icmp slt i16 [[ELT10]], [[MIN9]]
+; VI-NEXT:    [[MIN10:%.*]] = select i1 [[CMP9]], i16 [[ELT10]], i16 [[MIN9]]
+; VI-NEXT:    [[CMP10:%.*]] = icmp slt i16 [[ELT11]], [[MIN10]]
+; VI-NEXT:    [[MIN11:%.*]] = select i1 [[CMP10]], i16 [[ELT11]], i16 [[MIN10]]
+; VI-NEXT:    [[CMP11:%.*]] = icmp slt i16 [[ELT12]], [[MIN11]]
+; VI-NEXT:    [[MIN12:%.*]] = select i1 [[CMP11]], i16 [[ELT12]], i16 [[MIN11]]
+; VI-NEXT:    [[CMP12:%.*]] = icmp slt i16 [[ELT13]], [[MIN12]]
+; VI-NEXT:    [[MIN13:%.*]] = select i1 [[CMP12]], i16 [[ELT13]], i16 [[MIN12]]
+; VI-NEXT:    [[CMP13:%.*]] = icmp slt i16 [[ELT14]], [[MIN13]]
+; VI-NEXT:    [[MIN14:%.*]] = select i1 [[CMP13]], i16 [[ELT14]], i16 [[MIN13]]
+; VI-NEXT:    [[CMP14:%.*]] = icmp slt i16 [[ELT15]], [[MIN14]]
+; VI-NEXT:    [[MIN15:%.*]] = select i1 [[CMP14]], i16 [[ELT15]], i16 [[MIN14]]
+; VI-NEXT:    ret i16 [[MIN15]]
+;
+; 16-lane signed-min reduction (icmp slt + select chain); GFX9 checks expect
+; a four-level min/max shuffle reduction, VI checks keep all 15 scalar steps.
+entry:
+  %elt0 = extractelement <16 x i16> %vec16, i64 0
+  %elt1 = extractelement <16 x i16> %vec16, i64 1
+  %elt2 = extractelement <16 x i16> %vec16, i64 2
+  %elt3 = extractelement <16 x i16> %vec16, i64 3
+  %elt4 = extractelement <16 x i16> %vec16, i64 4
+  %elt5 = extractelement <16 x i16> %vec16, i64 5
+  %elt6 = extractelement <16 x i16> %vec16, i64 6
+  %elt7 = extractelement <16 x i16> %vec16, i64 7
+
+  %elt8 = extractelement <16 x i16> %vec16, i64 8
+  %elt9 = extractelement <16 x i16> %vec16, i64 9
+  %elt10 = extractelement <16 x i16> %vec16, i64 10
+  %elt11 = extractelement <16 x i16> %vec16, i64 11
+  %elt12 = extractelement <16 x i16> %vec16, i64 12
+  %elt13 = extractelement <16 x i16> %vec16, i64 13
+  %elt14 = extractelement <16 x i16> %vec16, i64 14
+  %elt15 = extractelement <16 x i16> %vec16, i64 15
+
+  %cmp0 = icmp slt i16 %elt1, %elt0
+  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
+  %cmp1 = icmp slt i16 %elt2, %min1
+  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
+  %cmp2 = icmp slt i16 %elt3, %min2
+  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
+
+  %cmp3 = icmp slt i16 %elt4, %min3
+  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
+  %cmp4 = icmp slt i16 %elt5, %min4
+  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
+
+  %cmp5 = icmp slt i16 %elt6, %min5
+  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
+  %cmp6 = icmp slt i16 %elt7, %min6
+  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
+
+  %cmp7 = icmp slt i16 %elt8, %min7
+  %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
+  %cmp8 = icmp slt i16 %elt9, %min8
+  %min9 = select i1 %cmp8, i16 %elt9, i16 %min8
+
+  %cmp9 = icmp slt i16 %elt10, %min9
+  %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
+  %cmp10 = icmp slt i16 %elt11, %min10
+  %min11 = select i1 %cmp10, i16 %elt11, i16 %min10
+
+  %cmp11 = icmp slt i16 %elt12, %min11
+  %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
+  %cmp12 = icmp slt i16 %elt13, %min12
+  %min13 = select i1 %cmp12, i16 %elt13, i16 %min12
+
+  %cmp13 = icmp slt i16 %elt14, %min13
+  %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
+  %cmp14 = icmp slt i16 %elt15, %min14
+  %min15 = select i1 %cmp14, i16 %elt15, i16 %min14
+
+
+  ret i16 %min15
+}
+
+define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
+; GFX9-LABEL: @reduction_umax_v4i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i16> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp ugt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_umax_v4i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
+; VI-NEXT:    [[CMP1:%.*]] = icmp ugt i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT:    [[CMP2:%.*]] = icmp ugt i16 [[ELT2]], [[MAX1]]
+; VI-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MAX1]]
+; VI-NEXT:    [[CMP3:%.*]] = icmp ugt i16 [[ELT3]], [[MAX2]]
+; VI-NEXT:    [[MAX3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MAX2]]
+; VI-NEXT:    ret i16 [[MAX3]]
+;
+; Unsigned-max reduction (icmp ugt + select chain); GFX9 checks expect the
+; vectorized min/max shuffle form, VI checks keep the scalar chain.
+entry:
+  %elt0 = extractelement <4 x i16> %vec4, i64 0
+  %elt1 = extractelement <4 x i16> %vec4, i64 1
+  %elt2 = extractelement <4 x i16> %vec4, i64 2
+  %elt3 = extractelement <4 x i16> %vec4, i64 3
+
+  %cmp1 = icmp ugt i16 %elt1, %elt0
+  %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0
+  %cmp2 = icmp ugt i16 %elt2, %max1
+  %max2 = select i1 %cmp2, i16 %elt2, i16 %max1
+  %cmp3 = icmp ugt i16 %elt3, %max2
+  %max3 = select i1 %cmp3, i16 %elt3, i16 %max2
+
+  ret i16 %max3
+}
+
+define i16 @reduction_smax_v4i16(<4 x i16> %vec4) {
+; GFX9-LABEL: @reduction_smax_v4i16(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i16> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_smax_v4i16(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
+; VI-NEXT:    [[CMP1:%.*]] = icmp sgt i16 [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT:    [[CMP2:%.*]] = icmp sgt i16 [[ELT2]], [[MAX1]]
+; VI-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MAX1]]
+; VI-NEXT:    [[CMP3:%.*]] = icmp sgt i16 [[ELT3]], [[MAX2]]
+; VI-NEXT:    [[MAX3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MAX2]]
+; VI-NEXT:    ret i16 [[MAX3]]
+;
+entry:
+  %elt0 = extractelement <4 x i16> %vec4, i64 0
+  %elt1 = extractelement <4 x i16> %vec4, i64 1
+  %elt2 = extractelement <4 x i16> %vec4, i64 2
+  %elt3 = extractelement <4 x i16> %vec4, i64 3
+
+  %cmp1 = icmp sgt i16 %elt1, %elt0
+  %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0
+  %cmp2 = icmp sgt i16 %elt2, %max1
+  %max2 = select i1 %cmp2, i16 %elt2, i16 %max1
+  %cmp3 = icmp sgt i16 %elt3, %max2
+  %max3 = select i1 %cmp3, i16 %elt3, i16 %max2
+
+  ret i16 %max3
+}
+
+define half @reduction_fmax_v4half(<4 x half> %vec4) {
+; GFX9-LABEL: @reduction_fmax_v4half(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x half> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_fmax_v4half(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
+; VI-NEXT:    [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
+; VI-NEXT:    [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]]
+; VI-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]]
+; VI-NEXT:    [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]]
+; VI-NEXT:    [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]]
+; VI-NEXT:    ret half [[MAX3]]
+;
+entry:
+  %elt0 = extractelement <4 x half> %vec4, i64 0
+  %elt1 = extractelement <4 x half> %vec4, i64 1
+  %elt2 = extractelement <4 x half> %vec4, i64 2
+  %elt3 = extractelement <4 x half> %vec4, i64 3
+
+  %cmp1 = fcmp fast ogt half %elt1, %elt0
+  %max1 = select i1 %cmp1, half %elt1, half %elt0
+  %cmp2 = fcmp fast ogt half %elt2, %max1
+  %max2 = select i1 %cmp2, half %elt2, half %max1
+  %cmp3 = fcmp fast ogt half %elt3, %max2
+  %max3 = select i1 %cmp3, half %elt3, half %max2
+
+  ret half %max3
+}
+
+define half @reduction_fmin_v4half(<4 x half> %vec4) {
+; GFX9-LABEL: @reduction_fmin_v4half(
+; GFX9-NEXT:  entry:
+; GFX9-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x half> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]
+; GFX9-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT:    ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_fmin_v4half(
+; VI-NEXT:  entry:
+; VI-NEXT:    [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
+; VI-NEXT:    [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
+; VI-NEXT:    [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
+; VI-NEXT:    [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
+; VI-NEXT:    [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]]
+; VI-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
+; VI-NEXT:    [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]]
+; VI-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]]
+; VI-NEXT:    [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]]
+; VI-NEXT:    [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]]
+; VI-NEXT:    ret half [[MIN3]]
+;
+entry:
+  %elt0 = extractelement <4 x half> %vec4, i64 0
+  %elt1 = extractelement <4 x half> %vec4, i64 1
+  %elt2 = extractelement <4 x half> %vec4, i64 2
+  %elt3 = extractelement <4 x half> %vec4, i64 3
+
+  %cmp1 = fcmp fast olt half %elt1, %elt0
+  %min1 = select i1 %cmp1, half %elt1, half %elt0
+  %cmp2 = fcmp fast olt half %elt2, %min1
+  %min2 = select i1 %cmp2, half %elt2, half %min1
+  %cmp3 = fcmp fast olt half %elt3, %min2
+  %min3 = select i1 %cmp3, half %elt3, half %min2
+
+  ret half %min3
+}
+
+; Tests to make sure reduction does not kick in. Vega does not support packed math for types larger than 16 bits.
+define float @reduction_v4float(<4 x float> %a) {
+; GCN-LABEL: @reduction_v4float(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x float> [[A]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x float> [[A]], i64 3
+; GCN-NEXT:    [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[ADD2:%.*]] = fadd fast float [[ELT2]], [[ADD1]]
+; GCN-NEXT:    [[ADD3:%.*]] = fadd fast float [[ELT3]], [[ADD2]]
+; GCN-NEXT:    ret float [[ADD3]]
+;
+entry:
+  %elt0 = extractelement <4 x float> %a, i64 0
+  %elt1 = extractelement <4 x float> %a, i64 1
+  %elt2 = extractelement <4 x float> %a, i64 2
+  %elt3 = extractelement <4 x float> %a, i64 3
+
+  %add1 = fadd fast float %elt1, %elt0
+  %add2 = fadd fast float %elt2, %add1
+  %add3 = fadd fast float %elt3, %add2
+
+  ret float %add3
+}
\ No newline at end of file

Added: llvm/trunk/test/Transforms/SLPVectorizer/ARM/extract-insert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/ARM/extract-insert.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/ARM/extract-insert.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/ARM/extract-insert.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=thumb7 -mcpu=swift | FileCheck %s
+
+define <4 x i32> @PR13837(<4 x float> %in) {
+; CHECK-LABEL: @PR13837(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <4 x float> [[IN:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP5]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[V3]]
+;
+  %t0 = extractelement <4 x float> %in, i64 0
+  %t1 = extractelement <4 x float> %in, i64 1
+  %t2 = extractelement <4 x float> %in, i64 2
+  %t3 = extractelement <4 x float> %in, i64 3
+  %c0 = fptosi float %t0 to i32
+  %c1 = fptosi float %t1 to i32
+  %c2 = fptosi float %t2 to i32
+  %c3 = fptosi float %t3 to i32
+  %v0 = insertelement <4 x i32> undef, i32 %c0, i32 0
+  %v1 = insertelement <4 x i32> %v0, i32 %c1, i32 1
+  %v2 = insertelement <4 x i32> %v1, i32 %c2, i32 2
+  %v3 = insertelement <4 x i32> %v2, i32 %c3, i32 3
+  ret <4 x i32> %v3
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/ARM/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/ARM/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/ARM/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/ARM/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if not 'ARM' in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/SLPVectorizer/ARM/memory.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/ARM/memory.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/ARM/memory.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/ARM/memory.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+; On Swift, unaligned <2 x double> stores need 4 uops and it is therefore
+; cheaper to do this scalar.
+
+define void @expensive_double_store(double* noalias %dst, double* noalias %src, i64 %count) {
+; CHECK-LABEL: @expensive_double_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[SRC:%.*]], align 8
+; CHECK-NEXT:    store double [[TMP0]], double* [[DST:%.*]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST]], i64 1
+; CHECK-NEXT:    store double [[TMP1]], double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load double, double* %src, align 8
+  store double %0, double* %dst, align 8
+  %arrayidx2 = getelementptr inbounds double, double* %src, i64 1
+  %1 = load double, double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %dst, i64 1
+  store double %1, double* %arrayidx3, align 8
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/ARM/sroa.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/ARM/sroa.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/ARM/sroa.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/ARM/sroa.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mcpu=swift -mtriple=thumbv7-apple-ios -basicaa -slp-vectorizer < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+%class.Complex = type { double, double }
+
+; Code like this is the result of SROA. Make sure we don't vectorize this,
+; because the scalar versions of the shl/or are handled by the
+; backend and disappear, while the vectorized code stays.
+
+define void @SROAed(%class.Complex* noalias nocapture sret %agg.result, [4 x i32] %a.coerce, [4 x i32] %b.coerce) {
+; CHECK-LABEL: @SROAed(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE:%.*]], 0
+; CHECK-NEXT:    [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_0_EXTRACT]] to i64
+; CHECK-NEXT:    [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 1
+; CHECK-NEXT:    [[A_SROA_0_4_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_1_EXTRACT]] to i64
+; CHECK-NEXT:    [[A_SROA_0_4_INSERT_SHIFT:%.*]] = shl nuw i64 [[A_SROA_0_4_INSERT_EXT]], 32
+; CHECK-NEXT:    [[A_SROA_0_4_INSERT_INSERT:%.*]] = or i64 [[A_SROA_0_4_INSERT_SHIFT]], [[A_SROA_0_0_INSERT_EXT]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[A_SROA_0_4_INSERT_INSERT]] to double
+; CHECK-NEXT:    [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 2
+; CHECK-NEXT:    [[A_SROA_3_8_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_2_EXTRACT]] to i64
+; CHECK-NEXT:    [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 3
+; CHECK-NEXT:    [[A_SROA_3_12_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_3_EXTRACT]] to i64
+; CHECK-NEXT:    [[A_SROA_3_12_INSERT_SHIFT:%.*]] = shl nuw i64 [[A_SROA_3_12_INSERT_EXT]], 32
+; CHECK-NEXT:    [[A_SROA_3_12_INSERT_INSERT:%.*]] = or i64 [[A_SROA_3_12_INSERT_SHIFT]], [[A_SROA_3_8_INSERT_EXT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[A_SROA_3_12_INSERT_INSERT]] to double
+; CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE:%.*]], 0
+; CHECK-NEXT:    [[B_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_0_EXTRACT]] to i64
+; CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 1
+; CHECK-NEXT:    [[B_SROA_0_4_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_1_EXTRACT]] to i64
+; CHECK-NEXT:    [[B_SROA_0_4_INSERT_SHIFT:%.*]] = shl nuw i64 [[B_SROA_0_4_INSERT_EXT]], 32
+; CHECK-NEXT:    [[B_SROA_0_4_INSERT_INSERT:%.*]] = or i64 [[B_SROA_0_4_INSERT_SHIFT]], [[B_SROA_0_0_INSERT_EXT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[B_SROA_0_4_INSERT_INSERT]] to double
+; CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 2
+; CHECK-NEXT:    [[B_SROA_3_8_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_2_EXTRACT]] to i64
+; CHECK-NEXT:    [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 3
+; CHECK-NEXT:    [[B_SROA_3_12_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_3_EXTRACT]] to i64
+; CHECK-NEXT:    [[B_SROA_3_12_INSERT_SHIFT:%.*]] = shl nuw i64 [[B_SROA_3_12_INSERT_EXT]], 32
+; CHECK-NEXT:    [[B_SROA_3_12_INSERT_INSERT:%.*]] = or i64 [[B_SROA_3_12_INSERT_SHIFT]], [[B_SROA_3_8_INSERT_EXT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[B_SROA_3_12_INSERT_INSERT]] to double
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd double [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[RE_I_I:%.*]] = getelementptr inbounds [[CLASS_COMPLEX:%.*]], %class.Complex* [[AGG_RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT:    store double [[ADD]], double* [[RE_I_I]], align 4
+; CHECK-NEXT:    [[IM_I_I:%.*]] = getelementptr inbounds [[CLASS_COMPLEX]], %class.Complex* [[AGG_RESULT]], i32 0, i32 1
+; CHECK-NEXT:    store double [[ADD3]], double* [[IM_I_I]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a.coerce.fca.0.extract = extractvalue [4 x i32] %a.coerce, 0
+  %a.sroa.0.0.insert.ext = zext i32 %a.coerce.fca.0.extract to i64
+  %a.coerce.fca.1.extract = extractvalue [4 x i32] %a.coerce, 1
+  %a.sroa.0.4.insert.ext = zext i32 %a.coerce.fca.1.extract to i64
+  %a.sroa.0.4.insert.shift = shl nuw i64 %a.sroa.0.4.insert.ext, 32
+  %a.sroa.0.4.insert.insert = or i64 %a.sroa.0.4.insert.shift, %a.sroa.0.0.insert.ext
+  %0 = bitcast i64 %a.sroa.0.4.insert.insert to double
+  %a.coerce.fca.2.extract = extractvalue [4 x i32] %a.coerce, 2
+  %a.sroa.3.8.insert.ext = zext i32 %a.coerce.fca.2.extract to i64
+  %a.coerce.fca.3.extract = extractvalue [4 x i32] %a.coerce, 3
+  %a.sroa.3.12.insert.ext = zext i32 %a.coerce.fca.3.extract to i64
+  %a.sroa.3.12.insert.shift = shl nuw i64 %a.sroa.3.12.insert.ext, 32
+  %a.sroa.3.12.insert.insert = or i64 %a.sroa.3.12.insert.shift, %a.sroa.3.8.insert.ext
+  %1 = bitcast i64 %a.sroa.3.12.insert.insert to double
+  %b.coerce.fca.0.extract = extractvalue [4 x i32] %b.coerce, 0
+  %b.sroa.0.0.insert.ext = zext i32 %b.coerce.fca.0.extract to i64
+  %b.coerce.fca.1.extract = extractvalue [4 x i32] %b.coerce, 1
+  %b.sroa.0.4.insert.ext = zext i32 %b.coerce.fca.1.extract to i64
+  %b.sroa.0.4.insert.shift = shl nuw i64 %b.sroa.0.4.insert.ext, 32
+  %b.sroa.0.4.insert.insert = or i64 %b.sroa.0.4.insert.shift, %b.sroa.0.0.insert.ext
+  %2 = bitcast i64 %b.sroa.0.4.insert.insert to double
+  %b.coerce.fca.2.extract = extractvalue [4 x i32] %b.coerce, 2
+  %b.sroa.3.8.insert.ext = zext i32 %b.coerce.fca.2.extract to i64
+  %b.coerce.fca.3.extract = extractvalue [4 x i32] %b.coerce, 3
+  %b.sroa.3.12.insert.ext = zext i32 %b.coerce.fca.3.extract to i64
+  %b.sroa.3.12.insert.shift = shl nuw i64 %b.sroa.3.12.insert.ext, 32
+  %b.sroa.3.12.insert.insert = or i64 %b.sroa.3.12.insert.shift, %b.sroa.3.8.insert.ext
+  %3 = bitcast i64 %b.sroa.3.12.insert.insert to double
+  %add = fadd double %0, %2
+  %add3 = fadd double %1, %3
+  %re.i.i = getelementptr inbounds %class.Complex, %class.Complex* %agg.result, i32 0, i32 0
+  store double %add, double* %re.i.i, align 4
+  %im.i.i = getelementptr inbounds %class.Complex, %class.Complex* %agg.result, i32 0, i32 1
+  store double %add3, double* %im.i.i, align 4
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -o - -S -slp-threshold=-1000 | FileCheck %s
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx--nvidiacl"
+
+; CTLZ cannot be vectorized currently because the second argument is a scalar
+; for both the scalar and vector forms of the intrinsic. In the future it
+; should be possible to vectorize such functions.
+; Test causes an assert if LLVM tries to vectorize CTLZ.
+
+define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
+; CHECK-LABEL: @cltz_test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i8> undef, i8 [[CALL_I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
+; CHECK-NEXT:    ret <2 x i8> [[VECINIT2]]
+;
+entry:
+  %0 = extractelement <2 x i8> %x, i32 0
+  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+  %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
+  %1 = extractelement <2 x i8> %x, i32 1
+  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+  ret <2 x i8> %vecinit2
+}
+
+define <2 x i8> @cltz_test2(<2 x i8> %x) #1 {
+; CHECK-LABEL: @cltz_test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i8> undef, i8 [[CALL_I]], i32 0
+; CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
+; CHECK-NEXT:    ret <2 x i8> [[VECINIT2]]
+;
+entry:
+  %0 = extractelement <2 x i8> %x, i32 0
+  %1 = extractelement <2 x i8> %x, i32 1
+  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
+  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+  ret <2 x i8> %vecinit2
+}
+
+declare i8 @llvm.ctlz.i8(i8, i1) #3
+
+attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }

Added: llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_40 | FileCheck %s -check-prefix=NOVECTOR
+
+define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %arg, i8* noalias nocapture readonly align 256 dereferenceable(19267584) %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 {
+; CHECK-LABEL: @fusion(
+; CHECK-NEXT:    [[TMP:%.*]] = shl nuw nsw i32 [[ARG2:%.*]], 6
+; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP]], [[ARG3:%.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half*
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x half> [[TMP2]], <half 0xH5380, half 0xH5380>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x half> [[TMP3]], <half 0xH57F0, half 0xH57F0>
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>*
+; CHECK-NEXT:    store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8
+; CHECK-NEXT:    ret void
+;
+; NOVECTOR-LABEL: @fusion(
+; NOVECTOR-NEXT:    [[TMP:%.*]] = shl nuw nsw i32 [[ARG2:%.*]], 6
+; NOVECTOR-NEXT:    [[TMP4:%.*]] = or i32 [[TMP]], [[ARG3:%.*]]
+; NOVECTOR-NEXT:    [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2
+; NOVECTOR-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; NOVECTOR-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], 1
+; NOVECTOR-NEXT:    [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half*
+; NOVECTOR-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]]
+; NOVECTOR-NEXT:    [[TMP12:%.*]] = load half, half* [[TMP11]], align 8
+; NOVECTOR-NEXT:    [[TMP13:%.*]] = fmul fast half [[TMP12]], 0xH5380
+; NOVECTOR-NEXT:    [[TMP14:%.*]] = fadd fast half [[TMP13]], 0xH57F0
+; NOVECTOR-NEXT:    [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
+; NOVECTOR-NEXT:    [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
+; NOVECTOR-NEXT:    store half [[TMP14]], half* [[TMP16]], align 8
+; NOVECTOR-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
+; NOVECTOR-NEXT:    [[TMP18:%.*]] = load half, half* [[TMP17]], align 2
+; NOVECTOR-NEXT:    [[TMP19:%.*]] = fmul fast half [[TMP18]], 0xH5380
+; NOVECTOR-NEXT:    [[TMP20:%.*]] = fadd fast half [[TMP19]], 0xH57F0
+; NOVECTOR-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
+; NOVECTOR-NEXT:    store half [[TMP20]], half* [[TMP21]], align 2
+; NOVECTOR-NEXT:    ret void
+;
+  %tmp = shl nuw nsw i32 %arg2, 6
+  %tmp4 = or i32 %tmp, %arg3
+  %tmp5 = shl nuw nsw i32 %tmp4, 2
+  %tmp6 = zext i32 %tmp5 to i64
+  %tmp7 = or i64 %tmp6, 1
+  %tmp10 = bitcast i8* %arg1 to half*
+  %tmp11 = getelementptr inbounds half, half* %tmp10, i64 %tmp6
+  %tmp12 = load half, half* %tmp11, align 8
+  %tmp13 = fmul fast half %tmp12, 0xH5380
+  %tmp14 = fadd fast half %tmp13, 0xH57F0
+  %tmp15 = bitcast i8* %arg to half*
+  %tmp16 = getelementptr inbounds half, half* %tmp15, i64 %tmp6
+  store half %tmp14, half* %tmp16, align 8
+  %tmp17 = getelementptr inbounds half, half* %tmp10, i64 %tmp7
+  %tmp18 = load half, half* %tmp17, align 2
+  %tmp19 = fmul fast half %tmp18, 0xH5380
+  %tmp20 = fadd fast half %tmp19, 0xH57F0
+  %tmp21 = getelementptr inbounds half, half* %tmp15, i64 %tmp7
+  store half %tmp20, half* %tmp21, align 2
+  ret void
+}
+
+attributes #0 = { nounwind }

Added: llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/aggregate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/aggregate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/aggregate.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/aggregate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s
+
+%struct.S = type { i8*, i8* }
+
+ at kS0 = common global %struct.S zeroinitializer, align 8
+
+define { i64, i64 } @getS() {
+; CHECK-LABEL: @getS(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i64, i64 } [[TMP2]], i64 [[TMP1]], 1
+; CHECK-NEXT:    ret { i64, i64 } [[TMP3]]
+;
+entry:
+  %0 = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
+  %1 = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
+  %2 = insertvalue { i64, i64 } undef, i64 %0, 0
+  %3 = insertvalue { i64, i64 } %2, i64 %1, 1
+  ret { i64, i64 } %3
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if not 'PowerPC' in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s
+
+%struct.A = type { i8*, i8* }
+
+define i64 @foo(%struct.A* nocapture readonly %this) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[END_I:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], %struct.A* [[THIS:%.*]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8** [[END_I]] to i64*
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.A* [[THIS]] to i64*
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+; CHECK-NEXT:    [[SUB_PTR_SUB_I:%.*]] = sub i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[SUB_PTR_SUB_I]], 9
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]]
+; CHECK:       lor.lhs.false:
+; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to i8*
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP1]] to i8*
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ugt i8* [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP2]], i64 2, i64 -1
+; CHECK-NEXT:    ret i64 [[DOT]]
+; CHECK:       return:
+; CHECK-NEXT:    ret i64 2
+;
+entry:
+  %end.i = getelementptr inbounds %struct.A, %struct.A* %this, i64 0, i32 1
+  %0 = bitcast i8** %end.i to i64*
+  %1 = load i64, i64* %0, align 8
+  %2 = bitcast %struct.A* %this to i64*
+  %3 = load i64, i64* %2, align 8
+  %sub.ptr.sub.i = sub i64 %1, %3
+  %cmp = icmp sgt i64 %sub.ptr.sub.i, 9
+  br i1 %cmp, label %return, label %lor.lhs.false
+
+lor.lhs.false:
+  %4 = inttoptr i64 %3 to i8*
+  %5 = inttoptr i64 %1 to i8*
+  %cmp2 = icmp ugt i8* %5, %4
+  %. = select i1 %cmp2, i64 2, i64 -1
+  ret i64 %.
+
+return:
+  ret i64 2
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P9
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P8
+
+%struct._pp = type { i16, i16, i16, i16 }
+
+; Function Attrs: norecurse nounwind readonly
+define [5 x double] @foo(double %k, i64 %n, %struct._pp* nocapture readonly %p) local_unnamed_addr #0 {
+entry:
+  %cmp17 = icmp sgt i64 %n, 0
+  br i1 %cmp17, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %retval.sroa.0.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %retval.sroa.4.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add10, %for.body ]
+  %.fca.0.insert = insertvalue [5 x double] undef, double %retval.sroa.0.0.lcssa, 0
+  %.fca.1.insert = insertvalue [5 x double] %.fca.0.insert, double %retval.sroa.4.0.lcssa, 1
+  ret [5 x double] %.fca.1.insert
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.020 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %retval.sroa.4.019 = phi double [ %add10, %for.body ], [ 0.000000e+00, %entry ]
+  %retval.sroa.0.018 = phi double [ %add, %for.body ], [ 0.000000e+00, %entry ]
+  %r1 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 2
+  %0 = load i16, i16* %r1, align 2
+  %conv2 = uitofp i16 %0 to double
+  %mul = fmul double %conv2, %k
+  %add = fadd double %retval.sroa.0.018, %mul
+  %g5 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 1
+  %1 = load i16, i16* %g5, align 2
+  %conv7 = uitofp i16 %1 to double
+  %mul8 = fmul double %conv7, %k
+  %add10 = fadd double %retval.sroa.4.019, %mul8
+  %inc = add nuw nsw i64 %i.020, 1
+  %exitcond = icmp eq i64 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-P8: load <2 x i16>
+; CHECK-P9-NOT: load <2 x i16>

Added: llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -debug-only=SLP \
+; RUN:   -S -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that SLP vectorizer gets the right cost difference for a compare
+; node.
+
+; Function Attrs: norecurse nounwind readonly
+define void @fun(i8* nocapture, i32 zeroext) local_unnamed_addr #0 {
+.lr.ph.preheader:
+  br label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
+  %2 = phi i32 [ %., %.lr.ph ], [ undef, %.lr.ph.preheader ]
+  %3 = phi i32 [ %.9, %.lr.ph ], [ undef, %.lr.ph.preheader ]
+  %4 = icmp ult i32 %2, %1
+  %5 = select i1 %4, i32 0, i32 %1
+  %. = sub i32 %2, %5
+  %6 = icmp ult i32 %3, %1
+  %7 = select i1 %6, i32 0, i32 %1
+  %.9 = sub i32 %3, %7
+  %8 = zext i32 %. to i64
+  %9 = getelementptr inbounds i8, i8* %0, i64 %8
+  %10 = load i8, i8* %9, align 1
+  %11 = zext i32 %.9 to i64
+  %12 = getelementptr inbounds i8, i8* %0, i64 %11
+  %13 = load i8, i8* %12, align 1
+  %14 = icmp eq i8 %10, %13
+  br i1 %14, label %.lr.ph, label %._crit_edge
+
+._crit_edge:                                      ; preds = %.lr.ph
+  ret void
+
+; CHECK: SLP: Adding cost -1 for bundle that starts with   %4 = icmp ult i32 %2, %1.
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if not 'SystemZ' in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -S < %s | FileCheck %s
+
+ at bar = external global [4 x [4 x i32]], align 4
+ at dct_luma = external global [4 x [4 x i32]], align 4
+
+define void @foo() local_unnamed_addr {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD277:%.*]] = add nsw i32 undef, undef
+; CHECK-NEXT:    store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
+; CHECK-NEXT:    [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
+; CHECK-NEXT:    [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
+; CHECK-NEXT:    [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[ADD277]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> undef, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP7]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %add277 = add nsw i32 undef, undef
+  store i32 %add277, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
+  %0 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
+  %sub355 = add nsw i32 undef, %0
+  %shr.i = ashr i32 %sub355, 6
+  %arrayidx372 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
+  store i32 %shr.i, i32* %arrayidx372, align 4
+  %sub355.1 = add nsw i32 undef, %add277
+  %shr.i.1 = ashr i32 %sub355.1, 6
+  %arrayidx372.1 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
+  store i32 %shr.i.1, i32* %arrayidx372.1, align 4
+  %1 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
+  %sub355.2 = add nsw i32 undef, %1
+  %shr.i.2 = ashr i32 %sub355.2, 6
+  %arrayidx372.2 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
+  store i32 %shr.i.2, i32* %arrayidx372.2, align 4
+  %2 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
+  %sub355.3 = add nsw i32 undef, %2
+  %shr.i.3 = ashr i32 %sub355.3, 6
+  %arrayidx372.3 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
+  store i32 %shr.i.3, i32* %arrayidx372.3, align 4
+  unreachable
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+
+define void @i64_simplified(i64* noalias %st, i64* noalias %ld) {
+; CHECK-LABEL: @i64_simplified(
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    ret void
+;
+  %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
+
+  %t0 = load i64, i64* %ld, align 8
+  %t1 = load i64, i64* %arrayidx1, align 8
+
+  %arrayidx3 = getelementptr inbounds i64, i64* %st, i64 1
+  %arrayidx4 = getelementptr inbounds i64, i64* %st, i64 2
+  %arrayidx5 = getelementptr inbounds i64, i64* %st, i64 3
+
+  store i64 %t0, i64* %st, align 8
+  store i64 %t1, i64* %arrayidx3, align 8
+  store i64 %t0, i64* %arrayidx4, align 8
+  store i64 %t1, i64* %arrayidx5, align 8
+  ret void
+}
+
+define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {
+; CHECK-LABEL: @i64_simplifiedi_reversed(
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
+
+  %t0 = load i64, i64* %ld, align 8
+  %t1 = load i64, i64* %arrayidx1, align 8
+
+  %arrayidx3 = getelementptr inbounds i64, i64* %st, i64 1
+  %arrayidx4 = getelementptr inbounds i64, i64* %st, i64 2
+  %arrayidx5 = getelementptr inbounds i64, i64* %st, i64 3
+
+  store i64 %t1, i64* %st, align 8
+  store i64 %t0, i64* %arrayidx3, align 8
+  store i64 %t1, i64* %arrayidx4, align 8
+  store i64 %t0, i64* %arrayidx5, align 8
+  ret void
+}
+
+define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) {
+; CHECK-LABEL: @i64_simplifiedi_extract(
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3
+; CHECK-NEXT:    store i64 [[TMP4]], i64* [[LD]], align 8
+; CHECK-NEXT:    ret void
+;
+  %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
+
+  %t0 = load i64, i64* %ld, align 8
+  %t1 = load i64, i64* %arrayidx1, align 8
+
+  %arrayidx3 = getelementptr inbounds i64, i64* %st, i64 1
+  %arrayidx4 = getelementptr inbounds i64, i64* %st, i64 2
+  %arrayidx5 = getelementptr inbounds i64, i64* %st, i64 3
+
+  store i64 %t0, i64* %st, align 8
+  store i64 %t0, i64* %arrayidx3, align 8
+  store i64 %t0, i64* %arrayidx4, align 8
+  store i64 %t1, i64* %arrayidx5, align 8
+  store i64 %t1, i64* %ld, align 8
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR34635.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR34635.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR34635.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR34635.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S -mcpu=corei7 | FileCheck %s
+
+define i32 @main() {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <8 x i32>, align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32>* [[TMP]] to [8 x i32]*
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32>* [[TMP]] to i8*
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <8 x i32>, <8 x i32>* [[TMP]], i64 0, i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 3
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 4
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 6
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 5
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 7
+; CHECK-NEXT:    store <8 x i32> <i32 -221320154, i32 -756426931, i32 563883532, i32 382683935, i32 144890241, i32 -1052877364, i32 -1052877364, i32 -1016007675>, <8 x i32>* [[TMP]], align 32
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP2]] to i8*
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP4]], align 32
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp slt i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP15]] to i32
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp slt i32 [[TMP18]], [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 [[TMP16]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP19]], i32 2, i32 [[TMP16]]
+; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp slt i32 [[TMP22]], [[TMP20]]
+; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP20]]
+; CHECK-NEXT:    [[TMP25:%.*]] = select i1 [[TMP23]], i32 3, i32 [[TMP21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp slt i32 [[TMP26]], [[TMP24]]
+; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP24]]
+; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 4, i32 [[TMP25]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp slt i32 [[TMP30]], [[TMP28]]
+; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP28]]
+; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 5, i32 [[TMP29]]
+; CHECK-NEXT:    [[TMP34:%.*]] = load i32, i32* [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp slt i32 [[TMP34]], [[TMP32]]
+; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 [[TMP32]]
+; CHECK-NEXT:    [[TMP37:%.*]] = select i1 [[TMP35]], i32 6, i32 [[TMP33]]
+; CHECK-NEXT:    [[TMP38:%.*]] = load i32, i32* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP39:%.*]] = icmp slt i32 [[TMP38]], [[TMP36]]
+; CHECK-NEXT:    [[TMP40:%.*]] = select i1 [[TMP39]], i32 7, i32 [[TMP37]]
+; CHECK-NEXT:    store i32 [[TMP40]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret i32 0
+;
+bb:
+  %tmp = alloca <8 x i32>, align 32
+  %tmp1 = bitcast <8 x i32>* %tmp to [8 x i32]*
+  %tmp2 = alloca i32, align 4
+  %tmp3 = bitcast <8 x i32>* %tmp to i8*
+  %tmp4 = getelementptr inbounds <8 x i32>, <8 x i32>* %tmp, i64 0, i64 0
+  %tmp5 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 1
+  %tmp6 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 2
+  %tmp7 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 3
+  %tmp8 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 4
+  %tmp9 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 6
+  %tmp10 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 5
+  %tmp11 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 7
+  store <8 x i32> <i32 -221320154, i32 -756426931, i32 563883532, i32 382683935, i32 144890241, i32 -1052877364, i32 -1052877364, i32 -1016007675>, <8 x i32>* %tmp, align 32
+  %tmp12 = bitcast i32* %tmp2 to i8*
+  %tmp13 = load i32, i32* %tmp4, align 32
+  %tmp14 = load i32, i32* %tmp5, align 4
+  %tmp15 = icmp slt i32 %tmp14, %tmp13
+  %tmp16 = select i1 %tmp15, i32 %tmp14, i32 %tmp13
+  %tmp17 = zext i1 %tmp15 to i32
+  %tmp18 = load i32, i32* %tmp6, align 8
+  %tmp19 = icmp slt i32 %tmp18, %tmp16
+  %tmp20 = select i1 %tmp19, i32 %tmp18, i32 %tmp16
+  %tmp21 = select i1 %tmp19, i32 2, i32 %tmp16
+  %tmp22 = load i32, i32* %tmp7, align 4
+  %tmp23 = icmp slt i32 %tmp22, %tmp20
+  %tmp24 = select i1 %tmp23, i32 %tmp22, i32 %tmp20
+  %tmp25 = select i1 %tmp23, i32 3, i32 %tmp21
+  %tmp26 = load i32, i32* %tmp8, align 16
+  %tmp27 = icmp slt i32 %tmp26, %tmp24
+  %tmp28 = select i1 %tmp27, i32 %tmp26, i32 %tmp24
+  %tmp29 = select i1 %tmp27, i32 4, i32 %tmp25
+  %tmp30 = load i32, i32* %tmp10, align 4
+  %tmp31 = icmp slt i32 %tmp30, %tmp28
+  %tmp32 = select i1 %tmp31, i32 %tmp30, i32 %tmp28
+  %tmp33 = select i1 %tmp31, i32 5, i32 %tmp29
+  %tmp34 = load i32, i32* %tmp9, align 8
+  %tmp35 = icmp slt i32 %tmp34, %tmp32
+  %tmp36 = select i1 %tmp35, i32 %tmp34, i32 %tmp32
+  %tmp37 = select i1 %tmp35, i32 6, i32 %tmp33
+  %tmp38 = load i32, i32* %tmp11, align 4
+  %tmp39 = icmp slt i32 %tmp38, %tmp36
+  %tmp40 = select i1 %tmp39, i32 7, i32 %tmp37
+  store i32 %tmp40, i32* %tmp2, align 4
+  ret i32 0
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35628_1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35628_1.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35628_1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+define void @mainTest(i32* %ptr) #0  {
+; CHECK-LABEL: @mainTest(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA5:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 1, undef
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], undef
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], undef
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP5]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP16]], 1
+; CHECK-NEXT:    [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]]
+; CHECK-NEXT:    [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], [[TMP6]]
+; CHECK-NEXT:    [[OP_EXTRA5]] = add i32 [[OP_EXTRA4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP15]], undef
+; CHECK-NEXT:    br label [[LOOP]]
+; CHECK:       bail_out:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i32* %ptr, null
+  br i1 %cmp, label %loop, label %bail_out
+
+loop:
+  %dummy_phi = phi i32 [ 1, %entry ], [ %18, %loop ]
+  %0 = load i32, i32 * %ptr , align 4
+  %1 = mul i32 %0, %0
+  %2 = add i32 1, %1
+  %3 = getelementptr inbounds i32, i32 * %ptr, i64 1
+  %4 = load i32, i32 * %3 , align 4
+  %5 = mul i32 %4, %4
+  %6 = add i32 %2, %4
+  %7 = add i32 %6, %5
+  %8 = getelementptr inbounds i32, i32 *%ptr, i64 2
+  %9 = load i32, i32 * %8 , align 4
+  %10 = mul i32 %9, %9
+  %11 = add i32 %7, %9
+  %12 = add i32 %11, %10
+  %13 = sext i32 %9 to i64
+  %14 = getelementptr inbounds i32, i32 *%ptr, i64 3
+  %15 = load i32, i32 * %14 , align 4
+  %16 = mul i32 %15, %15
+  %17 = add i32 %12, %15
+  %18 = add i32 %17, %16
+  br label %loop
+
+bail_out:
+  ret void
+}
+
+attributes #0 = { "target-cpu"="westmere" }
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35628_2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35628_2.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35628_2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+define void @test() #0 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP6:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[DUMMY_ADD:%.*]] = add i16 0, 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP4]], <i64 3, i64 2, i64 1, i64 0>
+; CHECK-NEXT:    [[TMP6]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = ashr exact <4 x i64> [[TMP8]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[SUM1:%.*]] = add i64 undef, undef
+; CHECK-NEXT:    [[SUM2:%.*]] = add i64 [[SUM1]], undef
+; CHECK-NEXT:    [[ZSUM:%.*]] = add i64 [[SUM2]], 0
+; CHECK-NEXT:    [[JOIN:%.*]] = add i64 [[TMP6]], [[ZSUM]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = add i64 [[TMP10]], 0
+; CHECK-NEXT:    [[OP_EXTRA3]] = add i64 [[OP_EXTRA]], [[TMP6]]
+; CHECK-NEXT:    [[LAST:%.*]] = add i64 [[JOIN]], undef
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %dummy_phi = phi i64 [ 1, %entry ], [ %last, %loop ]
+  %0 = phi i64 [ 2, %entry ], [ %fork, %loop ]
+  %inc1 = add i64 %0, 1
+  %inc2 = add i64 %0, 2
+  %inc11 = add i64 1, %inc1
+  %exact1 = ashr exact i64 %inc11, 32
+  %inc3 = add i64 %0, 3
+  %dummy_add = add i16 0, 0
+  %inc12 = add i64 1, %inc2
+  %exact2 = ashr exact i64 %inc12, 32
+  %dummy_shl = shl i64 %inc3, 32
+  %inc13 = add i64 1, %inc3
+  %exact3 = ashr exact i64 %inc13, 32
+  %fork = add i64 %0, 0
+  %sum1 = add i64 %exact3, %exact2
+  %sum2 = add i64 %sum1, %exact1
+  %zsum = add i64 %sum2, 0
+  %sext22 = add i64 1, %fork
+  %exact4 = ashr exact i64 %sext22, 32
+  %join = add i64 %fork, %zsum
+  %last = add i64 %join, %exact4
+  br label %loop
+}
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35777.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35777.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35777.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35777.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -verify -slp-vectorizer -o - -S -mtriple=x86_64-apple-macosx10.13.0 | FileCheck %s
+
+ at global = local_unnamed_addr global [6 x double] zeroinitializer, align 16
+
+define { i64, i64 } @patatino(double %arg) {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP11]], 1
+; CHECK-NEXT:    ret { i64, i64 } [[TMP17]]
+;
+bb:
+  %tmp = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 0), align 16
+  %tmp1 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2), align 16
+  %tmp2 = fmul double %tmp1, %arg
+  %tmp3 = fadd double %tmp, %tmp2
+  %tmp4 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4), align 16
+  %tmp5 = fadd double %tmp4, %tmp3
+  %tmp6 = fptosi double %tmp5 to i32
+  %tmp7 = sext i32 %tmp6 to i64
+  %tmp8 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 1), align 8
+  %tmp9 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 3), align 8
+  %tmp10 = fmul double %tmp9, %arg
+  %tmp11 = fadd double %tmp8, %tmp10
+  %tmp12 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 5), align 8
+  %tmp13 = fadd double %tmp12, %tmp11
+  %tmp14 = fptosi double %tmp13 to i32
+  %tmp15 = sext i32 %tmp14 to i64
+  %tmp16 = insertvalue { i64, i64 } undef, i64 %tmp7, 0
+  %tmp17 = insertvalue { i64, i64 } %tmp16, i64 %tmp15, 1
+  ret { i64, i64 } %tmp17
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35865.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35865.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35865.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR35865.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
+define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
+; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4
+; CHECK-NEXT:    [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32
+; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5
+; CHECK-NEXT:    [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32
+; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = extractelement <16 x half> undef, i32 4
+  %conv.i.4.i = fpext half %0 to float
+  %1 = bitcast float %conv.i.4.i to i32
+  %vecins.i.4.i = insertelement <8 x i32> undef, i32 %1, i32 4
+  %2 = extractelement <16 x half> undef, i32 5
+  %conv.i.5.i = fpext half %2 to float
+  %3 = bitcast float %conv.i.5.i to i32
+  %vecins.i.5.i = insertelement <8 x i32> %vecins.i.4.i, i32 %3, i32 5
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR36280.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+
+define float @jacobi(float* %p, float %x, float %y, float %z) {
+; CHECK-LABEL: @jacobi(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr float, float* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr float, float* [[P]], i64 2
+; CHECK-NEXT:    [[P1:%.*]] = load float, float* [[GEP1]]
+; CHECK-NEXT:    [[P2:%.*]] = load float, float* [[GEP2]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul float [[P1]], [[X:%.*]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul float [[P2]], [[Y:%.*]]
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd float [[MUL1]], [[Z:%.*]]
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd float [[MUL2]], [[ADD1]]
+; CHECK-NEXT:    ret float [[ADD2]]
+;
+  %gep1 = getelementptr float, float* %p, i64 1
+  %gep2 = getelementptr float, float* %p, i64 2
+  %p1 = load float, float* %gep1
+  %p2 = load float, float* %gep2
+  %mul1 = fmul float %p1, %x
+  %mul2 = fmul float %p2, %y
+  %add1 = fadd float %mul1, %z
+  %add2 = fadd float %mul2, %add1
+  ret float %add2
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR39774.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR39774.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR39774.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR39774.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 | FileCheck %s --check-prefixes=ALL,CHECK
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefixes=ALL,FORCE_REDUCTION
+
+define void @Test(i32) {
+; CHECK-LABEL: @Test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; CHECK-NEXT:    [[VAL_1:%.*]] = and i32 [[TMP2]], undef
+; CHECK-NEXT:    [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
+; CHECK-NEXT:    [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_7:%.*]] = and i32 [[VAL_5]], undef
+; CHECK-NEXT:    [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_12:%.*]] = and i32 [[VAL_10]], undef
+; CHECK-NEXT:    [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_19:%.*]] = and i32 [[VAL_17]], undef
+; CHECK-NEXT:    [[VAL_21:%.*]] = and i32 [[VAL_19]], undef
+; CHECK-NEXT:    [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_35:%.*]] = and i32 [[VAL_33]], undef
+; CHECK-NEXT:    [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_40:%.*]] = and i32 [[VAL_38]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]]
+; CHECK-NEXT:    [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]]
+; CHECK-NEXT:    [[VAL_42:%.*]] = and i32 [[VAL_40]], undef
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[OP_EXTRA30]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 14910, i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    br label [[LOOP]]
+;
+; FORCE_REDUCTION-LABEL: @Test(
+; FORCE_REDUCTION-NEXT:  entry:
+; FORCE_REDUCTION-NEXT:    br label [[LOOP:%.*]]
+; FORCE_REDUCTION:       loop:
+; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
+; FORCE_REDUCTION-NEXT:    [[VAL_1:%.*]] = and i32 [[TMP2]], undef
+; FORCE_REDUCTION-NEXT:    [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
+; FORCE_REDUCTION-NEXT:    [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_7:%.*]] = and i32 [[VAL_5]], undef
+; FORCE_REDUCTION-NEXT:    [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_12:%.*]] = and i32 [[VAL_10]], undef
+; FORCE_REDUCTION-NEXT:    [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_19:%.*]] = and i32 [[VAL_17]], undef
+; FORCE_REDUCTION-NEXT:    [[VAL_20:%.*]] = add i32 [[TMP2]], 1496
+; FORCE_REDUCTION-NEXT:    [[VAL_21:%.*]] = and i32 [[VAL_19]], [[VAL_20]]
+; FORCE_REDUCTION-NEXT:    [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_34:%.*]] = add i32 [[TMP2]], 8555
+; FORCE_REDUCTION-NEXT:    [[VAL_35:%.*]] = and i32 [[VAL_33]], [[VAL_34]]
+; FORCE_REDUCTION-NEXT:    [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; FORCE_REDUCTION-NEXT:    [[BIN_RDX:%.*]] = and <4 x i32> [[TMP3]], [[RDX_SHUF]]
+; FORCE_REDUCTION-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; FORCE_REDUCTION-NEXT:    [[BIN_RDX2:%.*]] = and <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]]
+; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP2]]
+; FORCE_REDUCTION-NEXT:    [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[VAL_39:%.*]] = add i32 [[TMP2]], 12529
+; FORCE_REDUCTION-NEXT:    [[VAL_40:%.*]] = and i32 [[OP_EXTRA29]], [[VAL_39]]
+; FORCE_REDUCTION-NEXT:    [[VAL_41:%.*]] = add i32 [[TMP2]], 13685
+; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]]
+; FORCE_REDUCTION-NEXT:    [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]]
+; FORCE_REDUCTION-NEXT:    [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; FORCE_REDUCTION-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+
+loop:
+  %local_4_39.us = phi i32 [ %val_42, %loop ], [ 0, %entry ]
+  %local_8_43.us = phi i32 [ %val_43, %loop ], [ 0, %entry ]
+  %val_0 = add i32 %local_4_39.us, 0
+  %val_1 = and i32 %local_8_43.us, %val_0
+  %val_2 = and i32 %val_1, %0
+  %val_3 = and i32 %val_2, %0
+  %val_4 = and i32 %val_3, %0
+  %val_5 = and i32 %val_4, %0
+  %val_6 = add i32 %local_8_43.us, 55
+  %val_7 = and i32 %val_5, %val_6
+  %val_8 = and i32 %val_7, %0
+  %val_9 = and i32 %val_8, %0
+  %val_10 = and i32 %val_9, %0
+  %val_11 = add i32 %local_8_43.us, 285
+  %val_12 = and i32 %val_10, %val_11
+  %val_13 = and i32 %val_12, %0
+  %val_14 = and i32 %val_13, %0
+  %val_15 = and i32 %val_14, %0
+  %val_16 = and i32 %val_15, %0
+  %val_17 = and i32 %val_16, %0
+  %val_18 = add i32 %local_8_43.us, 1240
+  %val_19 = and i32 %val_17, %val_18
+  %val_20 = add i32 %local_8_43.us, 1496
+  %val_21 = and i32 %val_19, %val_20
+  %val_22 = and i32 %val_21, %0
+  %val_23 = and i32 %val_22, %0
+  %val_24 = and i32 %val_23, %0
+  %val_25 = and i32 %val_24, %0
+  %val_26 = and i32 %val_25, %0
+  %val_27 = and i32 %val_26, %0
+  %val_28 = and i32 %val_27, %0
+  %val_29 = and i32 %val_28, %0
+  %val_30 = and i32 %val_29, %0
+  %val_31 = and i32 %val_30, %0
+  %val_32 = and i32 %val_31, %0
+  %val_33 = and i32 %val_32, %0
+  %val_34 = add i32 %local_8_43.us, 8555
+  %val_35 = and i32 %val_33, %val_34
+  %val_36 = and i32 %val_35, %0
+  %val_37 = and i32 %val_36, %0
+  %val_38 = and i32 %val_37, %0
+  %val_39 = add i32 %local_8_43.us, 12529
+  %val_40 = and i32 %val_38, %val_39
+  %val_41 = add i32 %local_8_43.us, 13685
+  %val_42 = and i32 %val_40, %val_41
+  %val_43 = add i32 %local_8_43.us, 14910
+  br label %loop
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR40310.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR40310.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR40310.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR40310.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s
+
+define void @mainTest(i32 %param, i32 * %vals, i32 %len) {
+; CHECK-LABEL: @mainTest(
+; CHECK-NEXT:  bci_15.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 31, i32 undef>, i32 [[PARAM:%.*]], i32 1
+; CHECK-NEXT:    br label [[BCI_15:%.*]]
+; CHECK:       bci_15:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
+; CHECK-NEXT:    store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
+; CHECK-NEXT:    [[V14:%.*]] = and i32 [[TMP2]], undef
+; CHECK-NEXT:    [[V16:%.*]] = and i32 undef, [[V14]]
+; CHECK-NEXT:    [[V18:%.*]] = and i32 undef, [[V16]]
+; CHECK-NEXT:    [[V20:%.*]] = and i32 undef, [[V18]]
+; CHECK-NEXT:    [[V22:%.*]] = and i32 undef, [[V20]]
+; CHECK-NEXT:    [[V24:%.*]] = and i32 undef, [[V22]]
+; CHECK-NEXT:    [[V26:%.*]] = and i32 undef, [[V24]]
+; CHECK-NEXT:    [[V28:%.*]] = and i32 undef, [[V26]]
+; CHECK-NEXT:    [[V30:%.*]] = and i32 undef, [[V28]]
+; CHECK-NEXT:    [[V32:%.*]] = and i32 undef, [[V30]]
+; CHECK-NEXT:    [[V34:%.*]] = and i32 undef, [[V32]]
+; CHECK-NEXT:    [[V36:%.*]] = and i32 undef, [[V34]]
+; CHECK-NEXT:    [[V38:%.*]] = and i32 undef, [[V36]]
+; CHECK-NEXT:    [[V40:%.*]] = and i32 undef, [[V38]]
+; CHECK-NEXT:    [[V42:%.*]] = and i32 undef, [[V40]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <16 x i32> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = and <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = and <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = and <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
+; CHECK-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[V43:%.*]] = and i32 undef, [[V42]]
+; CHECK-NEXT:    [[V44:%.*]] = add i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0
+; CHECK-NEXT:    [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1
+; CHECK-NEXT:    br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]]
+; CHECK:       loopexit:
+; CHECK-NEXT:    ret void
+;
+bci_15.preheader:
+  br label %bci_15
+
+bci_15:                                       ; preds = %bci_15.preheader, %bci_15
+  %local_0_ = phi i32 [ %v43, %bci_15 ], [ %param, %bci_15.preheader ]
+  %local_4_ = phi i32 [ %v44, %bci_15 ], [ 31, %bci_15.preheader ]
+  %v12 = add i32 %local_0_, -1
+  store atomic i32 %local_0_, i32 * %vals unordered, align 4
+  %v13 = add i32 %local_4_, 1
+  %v14 = and i32 %local_4_, %v12
+  %v15 = add i32 %local_4_, 2
+  %v16 = and i32 %v13, %v14
+  %v17 = add i32 %local_4_, 3
+  %v18 = and i32 %v15, %v16
+  %v19 = add i32 %local_4_, 4
+  %v20 = and i32 %v17, %v18
+  %v21 = add i32 %local_4_, 5
+  %v22 = and i32 %v19, %v20
+  %v23 = add i32 %local_4_, 6
+  %v24 = and i32 %v21, %v22
+  %v25 = add i32 %local_4_, 7
+  %v26 = and i32 %v23, %v24
+  %v27 = add i32 %local_4_, 8
+  %v28 = and i32 %v25, %v26
+  %v29 = add i32 %local_4_, 9
+  %v30 = and i32 %v27, %v28
+  %v31 = add i32 %local_4_, 10
+  %v32 = and i32 %v29, %v30
+  %v33 = add i32 %local_4_, 11
+  %v34 = and i32 %v31, %v32
+  %v35 = add i32 %local_4_, 12
+  %v36 = and i32 %v33, %v34
+  %v37 = add i32 %local_4_, 13
+  %v38 = and i32 %v35, %v36
+  %v39 = add i32 %local_4_, 14
+  %v40 = and i32 %v37, %v38
+  %v41 = add i32 %local_4_, 15
+  %v42 = and i32 %v39, %v40
+  %v43 = and i32 %v41, %v42
+  %v44 = add i32 %local_4_, 16
+  br i1 true, label %bci_15, label %loopexit
+
+loopexit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,390 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at b = common global [4 x i32] zeroinitializer, align 16
+ at c = common global [4 x i32] zeroinitializer, align 16
+ at d = common global [4 x i32] zeroinitializer, align 16
+ at e = common global [4 x i32] zeroinitializer, align 16
+ at a = common global [4 x i32] zeroinitializer, align 16
+ at fb = common global [4 x float] zeroinitializer, align 16
+ at fc = common global [4 x float] zeroinitializer, align 16
+ at fa = common global [4 x float] zeroinitializer, align 16
+ at fd = common global [4 x float] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @addsub() #0 {
+; CHECK-LABEL: @addsub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 0), align 4
+  %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 0), align 4
+  %add = add nsw i32 %0, %1
+  %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 0), align 4
+  %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 0), align 4
+  %add1 = add nsw i32 %2, %3
+  %add2 = add nsw i32 %add, %add1
+  store i32 %add2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 0), align 4
+  %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 1), align 4
+  %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 1), align 4
+  %add3 = add nsw i32 %4, %5
+  %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 1), align 4
+  %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 1), align 4
+  %add4 = add nsw i32 %6, %7
+  %sub = sub nsw i32 %add3, %add4
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 1), align 4
+  %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 2), align 4
+  %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 2), align 4
+  %add5 = add nsw i32 %8, %9
+  %10 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 2), align 4
+  %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 2), align 4
+  %add6 = add nsw i32 %10, %11
+  %add7 = add nsw i32 %add5, %add6
+  store i32 %add7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 2), align 4
+  %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 3), align 4
+  %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 3), align 4
+  %add8 = add nsw i32 %12, %13
+  %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 3), align 4
+  %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 3), align 4
+  %add9 = add nsw i32 %14, %15
+  %sub10 = sub nsw i32 %add8, %add9
+  store i32 %sub10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 3), align 4
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @subadd() #0 {
+; CHECK-LABEL: @subadd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 0), align 4
+  %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 0), align 4
+  %add = add nsw i32 %0, %1
+  %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 0), align 4
+  %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 0), align 4
+  %add1 = add nsw i32 %2, %3
+  %sub = sub nsw i32 %add, %add1
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 0), align 4
+  %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 1), align 4
+  %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 1), align 4
+  %add2 = add nsw i32 %4, %5
+  %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 1), align 4
+  %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 1), align 4
+  %add3 = add nsw i32 %6, %7
+  %add4 = add nsw i32 %add2, %add3
+  store i32 %add4, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 1), align 4
+  %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 2), align 4
+  %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 2), align 4
+  %add5 = add nsw i32 %8, %9
+  %10 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 2), align 4
+  %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 2), align 4
+  %add6 = add nsw i32 %10, %11
+  %sub7 = sub nsw i32 %add5, %add6
+  store i32 %sub7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 2), align 4
+  %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 3), align 4
+  %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 3), align 4
+  %add8 = add nsw i32 %12, %13
+  %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 3), align 4
+  %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 3), align 4
+  %add9 = add nsw i32 %14, %15
+  %add10 = add nsw i32 %add8, %add9
+  store i32 %add10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 3), align 4
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @faddfsub() #0 {
+; CHECK-LABEL: @faddfsub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+  %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+  %sub = fsub float %2, %3
+  store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+  %add1 = fadd float %4, %5
+  store float %add1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+  %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+  %sub2 = fsub float %6, %7
+  store float %sub2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @fsubfadd() #0 {
+; CHECK-LABEL: @fsubfadd(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+  %sub = fsub float %0, %1
+  store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+  %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+  %add = fadd float %2, %3
+  store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+  %sub1 = fsub float %4, %5
+  store float %sub1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+  %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+  %add2 = fadd float %6, %7
+  store float %add2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @faddfsub_select() #0 {
+; CHECK-LABEL: @faddfsub_select(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+  %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+  %add1 = fadd float %2, %3
+  store float %add1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+  %add2 = fadd float %4, %5
+  store float %add2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+  %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+  %sub = fsub float %6, %7
+  store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+  ret void
+}
+
+; Check vectorization of the following code for the float data type:
+;  fc[0] = fb[0]+fa[0]; //swapped fb and fa
+;  fc[1] = fa[1]-fb[1];
+;  fc[2] = fa[2]+fb[2];
+;  fc[3] = fa[3]-fb[3];
+
+define void @reorder_alt() #0 {
+; CHECK-LABEL: @reorder_alt(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+  %3 = fadd float %1, %2
+  store float %3, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+  %6 = fsub float %4, %5
+  store float %6, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+  %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+  %9 = fadd float %7, %8
+  store float %9, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+  %10 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+  %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+  %12 = fsub float %10, %11
+  store float %12, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+  ret void
+}
+
+; Check vectorization of the following code for the float data type:
+;  fc[0] = fa[0]+(fb[0]-fd[0]);
+;  fc[1] = fa[1]-(fb[1]+fd[1]);
+;  fc[2] = fa[2]+(fb[2]-fd[2]);
+;  fc[3] = fa[3]-(fd[3]+fb[3]); //swapped fd and fb
+
+define void @reorder_alt_subTree() #0 {
+; CHECK-LABEL: @reorder_alt_subTree(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fd to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <4 x float> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub <4 x float> [[TMP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+  %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 0), align 4
+  %4 = fsub float %2, %3
+  %5 = fadd float %1, %4
+  store float %5, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+  %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+  %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 1), align 4
+  %9 = fadd float %7, %8
+  %10 = fsub float %6, %9
+  store float %10, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+  %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+  %12 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+  %13 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 2), align 4
+  %14 = fsub float %12, %13
+  %15 = fadd float %11, %14
+  store float %15, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+  %16 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+  %17 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 3), align 4
+  %18 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+  %19 = fadd float %17, %18
+  %20 = fsub float %16, %19
+  store float %20, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+  ret void
+}
+
+; Check vectorization of the following code for the double data type:
+;  c[0] = (a[0]+b[0])-d[0];
+;  c[1] = d[1]+(a[1]+b[1]); //swapped d[1] and (a[1]+b[1])
+
+define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) {
+; CHECK-LABEL: @reorder_alt_rightsubTree(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[D]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[B]] to <2 x double>*
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x double> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP10]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[C]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP15]], align 8
+; CHECK-NEXT:    ret void
+;
+  %1 = load double, double* %a
+  %2 = load double, double* %b
+  %3 = fadd double %1, %2
+  %4 = load double, double* %d
+  %5 = fsub double %3, %4
+  store double %5, double* %c
+  %6 = getelementptr inbounds double, double* %d, i64 1
+  %7 = load double, double* %6
+  %8 = getelementptr inbounds double, double* %a, i64 1
+  %9 = load double, double* %8
+  %10 = getelementptr inbounds double, double* %b, i64 1
+  %11 = load double, double* %10
+  %12 = fadd double %9, %11
+  %13 = fadd double %7, %12
+  %14 = getelementptr inbounds double, double* %c, i64 1
+  store double %13, double* %14
+  ret void
+}
+
+; Don't vectorize the following code for the float data type, as fsub is not commutative:
+;  fc[0] = fb[0]+fa[0];
+;  fc[1] = fa[1]-fb[1];
+;  fc[2] = fa[2]+fb[2];
+;  fc[3] = fb[3]-fa[3];
+;  In the above code we can swap the operands of the 1st and 3rd operations,
+;  as fadd is commutative, but not of the 2nd or 4th, as fsub is not commutative.
+
+define void @no_vec_shuff_reorder() #0 {
+; CHECK-LABEL: @no_vec_shuff_reorder(
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+  %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+  %3 = fadd float %1, %2
+  store float %3, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+  %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+  %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+  %6 = fsub float %4, %5
+  store float %6, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+  %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+  %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+  %9 = fadd float %7, %8
+  store float %9, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+  %10 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+  %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+  %12 = fsub float %10, %11
+  store float %12, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+  ret void
+}
+
+
+attributes #0 = { nounwind }
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/aggregate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/aggregate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/aggregate.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/aggregate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-unknown-linux -mcpu=corei7 -slp-vectorizer < %s | FileCheck %s
+
+%struct.S = type { i8*, i8* }
+
+ at kS0 = common global %struct.S zeroinitializer, align 8
+
+define { i64, i64 } @getS() {
+; CHECK-LABEL: @getS(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { i64, i64 } [[TMP2]], i64 [[TMP1]], 1
+; CHECK-NEXT:    ret { i64, i64 } [[TMP3]]
+;
+entry:
+  %0 = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
+  %1 = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
+  %2 = insertvalue { i64, i64 } undef, i64 %0, 0
+  %3 = insertvalue { i64, i64 } %2, i64 %1, 1
+  ret { i64, i64 } %3
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/align.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/align.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/align.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/align.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP_I_I_SROA_0:%.*]] = alloca [3 x double], align 16
+; CHECK-NEXT:    [[STORE1:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 1
+; CHECK-NEXT:    [[STORE2:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 2
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[STORE1]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %agg.tmp.i.i.sroa.0 = alloca [3 x double], align 16
+  %i0 = load double, double* %a
+  %i1 = load double, double* %b
+  %mul = fmul double %i0, %i1
+  %store1 = getelementptr inbounds [3 x double], [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 1
+  %store2 = getelementptr inbounds [3 x double], [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 2
+  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+  %i3 = load double, double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+  %i4 = load double, double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %store1
+  store double %mul5, double* %store2, align 16
+  ret void
+}
+
+; Float has 4-byte ABI alignment on x86_64. We must use the alignment of the
+; value being loaded/stored, not the alignment of the pointer type.
+
+define void @test2(float * %a, float * %b) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
+; CHECK-NEXT:    [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT:    [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %l0 = load float, float* %a
+  %a1 = getelementptr inbounds float, float* %a, i64 1
+  %l1 = load float, float* %a1
+  %a2 = getelementptr inbounds float, float* %a, i64 2
+  %l2 = load float, float* %a2
+  %a3 = getelementptr inbounds float, float* %a, i64 3
+  %l3 = load float, float* %a3
+  store float %l0, float* %b
+  %b1 = getelementptr inbounds float, float* %b, i64 1
+  store float %l1, float* %b1
+  %b2 = getelementptr inbounds float, float* %b, i64 2
+  store float %l2, float* %b2
+  %b3 = getelementptr inbounds float, float* %b, i64 3
+  store float %l3, float* %b3
+  ret void
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-calls.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-calls.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-calls.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define <8 x float> @ceil_floor(<8 x float> %a) {
+; CHECK-LABEL: @ceil_floor(
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; CHECK-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; CHECK-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
+; CHECK-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
+; CHECK-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; CHECK-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
+; CHECK-NEXT:    [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
+; CHECK-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
+; CHECK-NEXT:    [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
+; CHECK-NEXT:    [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %ab0 = call float @llvm.ceil.f32(float %a0)
+  %ab1 = call float @llvm.floor.f32(float %a1)
+  %ab2 = call float @llvm.floor.f32(float %a2)
+  %ab3 = call float @llvm.ceil.f32(float %a3)
+  %ab4 = call float @llvm.ceil.f32(float %a4)
+  %ab5 = call float @llvm.ceil.f32(float %a5)
+  %ab6 = call float @llvm.floor.f32(float %a6)
+  %ab7 = call float @llvm.floor.f32(float %a7)
+  %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+declare float @llvm.ceil.f32(float)
+declare float @llvm.floor.f32(float)

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-cast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-cast.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-cast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,489 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
+; SSE-LABEL: @sitofp_uitofp(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SSE-NEXT:    [[AB2:%.*]] = sitofp i32 [[A2]] to float
+; SSE-NEXT:    [[AB3:%.*]] = sitofp i32 [[A3]] to float
+; SSE-NEXT:    [[AB4:%.*]] = uitofp i32 [[A4]] to float
+; SSE-NEXT:    [[AB5:%.*]] = uitofp i32 [[A5]] to float
+; SSE-NEXT:    [[AB6:%.*]] = uitofp i32 [[A6]] to float
+; SSE-NEXT:    [[AB7:%.*]] = uitofp i32 [[A7]] to float
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @sitofp_uitofp(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SLM-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SLM-NEXT:    [[AB2:%.*]] = sitofp i32 [[A2]] to float
+; SLM-NEXT:    [[AB3:%.*]] = sitofp i32 [[A3]] to float
+; SLM-NEXT:    [[AB4:%.*]] = uitofp i32 [[A4]] to float
+; SLM-NEXT:    [[AB5:%.*]] = uitofp i32 [[A5]] to float
+; SLM-NEXT:    [[AB6:%.*]] = uitofp i32 [[A6]] to float
+; SLM-NEXT:    [[AB7:%.*]] = uitofp i32 [[A7]] to float
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SLM-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @sitofp_uitofp(
+; AVX-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @sitofp_uitofp(
+; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = sitofp i32 %a0 to float
+  %ab1 = sitofp i32 %a1 to float
+  %ab2 = sitofp i32 %a2 to float
+  %ab3 = sitofp i32 %a3 to float
+  %ab4 = uitofp i32 %a4 to float
+  %ab5 = uitofp i32 %a5 to float
+  %ab6 = uitofp i32 %a6 to float
+  %ab7 = uitofp i32 %a7 to float
+  %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
+; SSE-LABEL: @fptosi_fptoui(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
+; SSE-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
+; SSE-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
+; SSE-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
+; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
+; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
+; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @fptosi_fptoui(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
+; SLM-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
+; SLM-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
+; SLM-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SLM-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
+; SLM-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
+; SLM-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
+; SLM-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SLM-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX-LABEL: @fptosi_fptoui(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; AVX-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; AVX-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; AVX-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; AVX-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; AVX-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
+; AVX-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
+; AVX-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
+; AVX-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; AVX-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
+; AVX-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
+; AVX-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
+; AVX-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
+; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @fptosi_fptoui(
+; AVX512-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %ab0 = fptosi float %a0 to i32
+  %ab1 = fptosi float %a1 to i32
+  %ab2 = fptosi float %a2 to i32
+  %ab3 = fptosi float %a3 to i32
+  %ab4 = fptoui float %a4 to i32
+  %ab5 = fptoui float %a5 to i32
+  %ab6 = fptoui float %a6 to i32
+  %ab7 = fptoui float %a7 to i32
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x float> @fneg_fabs(<8 x float> %a) {
+; CHECK-LABEL: @fneg_fabs(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 undef, i32 undef, i32 undef, i32 undef, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
+; CHECK-NEXT:    ret <8 x float> [[TMP5]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %aa0 = bitcast float %a0 to i32
+  %aa1 = bitcast float %a1 to i32
+  %aa2 = bitcast float %a2 to i32
+  %aa3 = bitcast float %a3 to i32
+  %aa4 = bitcast float %a4 to i32
+  %aa5 = bitcast float %a5 to i32
+  %aa6 = bitcast float %a6 to i32
+  %aa7 = bitcast float %a7 to i32
+  %ab0 = xor i32 %aa0, -2147483648
+  %ab1 = xor i32 %aa1, -2147483648
+  %ab2 = xor i32 %aa2, -2147483648
+  %ab3 = xor i32 %aa3, -2147483648
+  %ab4 = and i32 %aa4, 2147483647
+  %ab5 = and i32 %aa5, 2147483647
+  %ab6 = and i32 %aa6, 2147483647
+  %ab7 = and i32 %aa7, 2147483647
+  %ac0 = bitcast i32 %ab0 to float
+  %ac1 = bitcast i32 %ab1 to float
+  %ac2 = bitcast i32 %ab2 to float
+  %ac3 = bitcast i32 %ab3 to float
+  %ac4 = bitcast i32 %ab4 to float
+  %ac5 = bitcast i32 %ab5 to float
+  %ac6 = bitcast i32 %ab6 to float
+  %ac7 = bitcast i32 %ab7 to float
+  %r0 = insertelement <8 x float> undef, float %ac0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ac1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ac2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ac3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ac4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ac5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ac6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ac7, i32 7
+  ret <8 x float> %r7
+}
+
+define <8 x i32> @sext_zext(<8 x i16> %a) {
+; CHECK-LABEL: @sext_zext(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i16> %a, i32 0
+  %a1 = extractelement <8 x i16> %a, i32 1
+  %a2 = extractelement <8 x i16> %a, i32 2
+  %a3 = extractelement <8 x i16> %a, i32 3
+  %a4 = extractelement <8 x i16> %a, i32 4
+  %a5 = extractelement <8 x i16> %a, i32 5
+  %a6 = extractelement <8 x i16> %a, i32 6
+  %a7 = extractelement <8 x i16> %a, i32 7
+  %ab0 = sext i16 %a0 to i32
+  %ab1 = sext i16 %a1 to i32
+  %ab2 = sext i16 %a2 to i32
+  %ab3 = sext i16 %a3 to i32
+  %ab4 = zext i16 %a4 to i32
+  %ab5 = zext i16 %a5 to i32
+  %ab6 = zext i16 %a6 to i32
+  %ab7 = zext i16 %a7 to i32
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK-LABEL: @sitofp_4i32_8i16(
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; CHECK-NEXT:    [[AB5:%.*]] = sitofp i16 [[B1]] to float
+; CHECK-NEXT:    [[AB6:%.*]] = sitofp i16 [[B2]] to float
+; CHECK-NEXT:    [[AB7:%.*]] = sitofp i16 [[B3]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %b2 = extractelement <8 x i16> %b, i32 2
+  %b3 = extractelement <8 x i16> %b, i32 3
+  %ab0 = sitofp i32 %a0 to float
+  %ab1 = sitofp i32 %a1 to float
+  %ab2 = sitofp i32 %a2 to float
+  %ab3 = sitofp i32 %a3 to float
+  %ab4 = sitofp i16 %b0 to float
+  %ab5 = sitofp i16 %b1 to float
+  %ab6 = sitofp i16 %b2 to float
+  %ab7 = sitofp i16 %b3 to float
+  %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+; Inspired by PR38154
+define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
+; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SSE-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; SSE-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SSE-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
+; SSE-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SSE-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; SSE-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; SSE-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SLM-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; SLM-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; SLM-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SLM-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SLM-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
+; SLM-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; SLM-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SLM-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; SLM-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; SLM-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SLM-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; AVX-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; AVX-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; AVX-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; AVX-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
+; AVX-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; AVX-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX512-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX512-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX512-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX512-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; AVX512-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX512-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX512-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX512-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX512-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <8 x i16> %b, i32 0
+  %b1 = extractelement <8 x i16> %b, i32 1
+  %c0 = extractelement <16 x i8> %c, i32 0
+  %c1 = extractelement <16 x i8> %c, i32 1
+  %ab0 = sitofp i32 %a0 to float
+  %ab1 = sitofp i32 %a1 to float
+  %ab2 = uitofp i32 %a2 to float
+  %ab3 = uitofp i32 %a3 to float
+  %ab4 = sitofp i16 %b0 to float
+  %ab5 = uitofp i16 %b1 to float
+  %ab6 = sitofp  i8 %c0 to float
+  %ab7 = uitofp  i8 %c1 to float
+  %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-fp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @fadd_fsub_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %ab0 = fadd float %a0, %b0
+  %ab1 = fsub float %a1, %b1
+  %ab2 = fsub float %a2, %b2
+  %ab3 = fadd float %a3, %b3
+  %ab4 = fadd float %a4, %b4
+  %ab5 = fsub float %a5, %b5
+  %ab6 = fsub float %a6, %b6
+  %ab7 = fadd float %a7, %b7
+  %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @fmul_fdiv_v8f32(
+; SSE-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @fmul_fdiv_v8f32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]]
+; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> <i32 4, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 9, i32 10, i32 undef>
+; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
+; SLM-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @fmul_fdiv_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX-NEXT:    ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @fmul_fdiv_v8f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX512-NEXT:    ret <8 x float> [[R7]]
+;
+  %a0 = extractelement <8 x float> %a, i32 0
+  %a1 = extractelement <8 x float> %a, i32 1
+  %a2 = extractelement <8 x float> %a, i32 2
+  %a3 = extractelement <8 x float> %a, i32 3
+  %a4 = extractelement <8 x float> %a, i32 4
+  %a5 = extractelement <8 x float> %a, i32 5
+  %a6 = extractelement <8 x float> %a, i32 6
+  %a7 = extractelement <8 x float> %a, i32 7
+  %b0 = extractelement <8 x float> %b, i32 0
+  %b1 = extractelement <8 x float> %b, i32 1
+  %b2 = extractelement <8 x float> %b, i32 2
+  %b3 = extractelement <8 x float> %b, i32 3
+  %b4 = extractelement <8 x float> %b, i32 4
+  %b5 = extractelement <8 x float> %b, i32 5
+  %b6 = extractelement <8 x float> %b, i32 6
+  %b7 = extractelement <8 x float> %b, i32 7
+  %ab0 = fmul float %a0, %b0
+  %ab1 = fdiv float %a1, %b1
+  %ab2 = fdiv float %a2, %b2
+  %ab3 = fmul float %a3, %b3
+  %ab4 = fmul float %a4, %b4
+  %ab5 = fdiv float %a5, %b5
+  %ab6 = fdiv float %a6, %b6
+  %ab7 = fmul float %a7, %b7
+  %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+  %r1 = insertelement <8 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <8 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <8 x float>   %r2, float %ab3, i32 3
+  %r4 = insertelement <8 x float>   %r3, float %ab4, i32 4
+  %r5 = insertelement <8 x float>   %r4, float %ab5, i32 5
+  %r6 = insertelement <8 x float>   %r5, float %ab6, i32 6
+  %r7 = insertelement <8 x float>   %r6, float %ab7, i32 7
+  ret <8 x float> %r7
+}
+
+define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
+; SSE-LABEL: @fmul_fdiv_v4f32_const(
+; SSE-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT:    ret <4 x float> [[TMP1]]
+;
+; SLM-LABEL: @fmul_fdiv_v4f32_const(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
+; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    ret <4 x float> [[R3]]
+;
+; AVX-LABEL: @fmul_fdiv_v4f32_const(
+; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT:    ret <4 x float> [[TMP1]]
+;
+; AVX512-LABEL: @fmul_fdiv_v4f32_const(
+; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %ab0 = fmul float %a0, 2.0
+  %ab1 = fmul float %a1, 1.0
+  %ab2 = fdiv float %a2, 1.0
+  %ab3 = fdiv float %a3, 0.5
+  %r0 = insertelement <4 x float> undef, float %ab0, i32 0
+  %r1 = insertelement <4 x float>   %r0, float %ab1, i32 1
+  %r2 = insertelement <4 x float>   %r1, float %ab2, i32 2
+  %r3 = insertelement <4 x float>   %r2, float %ab3, i32 3
+  ret <4 x float> %r3
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-int.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-int.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/alternate-int.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,571 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @add_sub_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %ab0 = add i32 %a0, %b0
+  %ab1 = add i32 %a1, %b1
+  %ab2 = add i32 %a2, %b2
+  %ab3 = add i32 %a3, %b3
+  %ab4 = sub i32 %a4, %b4
+  %ab5 = sub i32 %a5, %b5
+  %ab6 = sub i32 %a6, %b6
+  %ab7 = sub i32 %a7, %b7
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <4 x i32> @add_and_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @add_and_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[R3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %ab0 = add i32 %a0, %b0
+  %ab1 = add i32 %a1, %b1
+  %ab2 = and i32 %a2, %b2
+  %ab3 = and i32 %a3, %b3
+  %r0 = insertelement <4 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <4 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <4 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <4 x i32>   %r2, i32 %ab3, i32 3
+  ret <4 x i32> %r3
+}
+
+define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: @add_mul_v4i32(
+; SSE-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; SSE-NEXT:    ret <4 x i32> [[R3]]
+;
+; SLM-LABEL: @add_mul_v4i32(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3
+; SLM-NEXT:    [[AB0:%.*]] = mul i32 [[A0]], [[B0]]
+; SLM-NEXT:    [[AB1:%.*]] = add i32 [[A1]], [[B1]]
+; SLM-NEXT:    [[AB2:%.*]] = add i32 [[A2]], [[B2]]
+; SLM-NEXT:    [[AB3:%.*]] = mul i32 [[A3]], [[B3]]
+; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT:    ret <4 x i32> [[R3]]
+;
+; AVX-LABEL: @add_mul_v4i32(
+; AVX-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; AVX-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT:    ret <4 x i32> [[R3]]
+;
+; AVX512-LABEL: @add_mul_v4i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX512-NEXT:    ret <4 x i32> [[R3]]
+;
+  %a0 = extractelement <4 x i32> %a, i32 0
+  %a1 = extractelement <4 x i32> %a, i32 1
+  %a2 = extractelement <4 x i32> %a, i32 2
+  %a3 = extractelement <4 x i32> %a, i32 3
+  %b0 = extractelement <4 x i32> %b, i32 0
+  %b1 = extractelement <4 x i32> %b, i32 1
+  %b2 = extractelement <4 x i32> %b, i32 2
+  %b3 = extractelement <4 x i32> %b, i32 3
+  %ab0 = mul i32 %a0, %b0
+  %ab1 = add i32 %a1, %b1
+  %ab2 = add i32 %a2, %b2
+  %ab3 = mul i32 %a3, %b3
+  %r0 = insertelement <4 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <4 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <4 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <4 x i32>   %r2, i32 %ab3, i32 3
+  ret <4 x i32> %r3
+}
+
+define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: @ashr_shl_v8i32(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SSE-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
+; SSE-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
+; SSE-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @ashr_shl_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX-LABEL: @ashr_shl_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %ab0 = ashr i32 %a0, %b0
+  %ab1 = ashr i32 %a1, %b1
+  %ab2 = ashr i32 %a2, %b2
+  %ab3 = ashr i32 %a3, %b3
+  %ab4 = shl  i32 %a4, %b4
+  %ab5 = shl  i32 %a5, %b5
+  %ab6 = shl  i32 %a6, %b6
+  %ab7 = shl  i32 %a7, %b7
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
+; SSE-LABEL: @ashr_shl_v8i32_const(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @ashr_shl_v8i32_const(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+; SLM-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_shl_v8i32_const(
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
+; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_shl_v8i32_const(
+; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32_const(
+; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = ashr i32 %a0, 2
+  %ab1 = ashr i32 %a1, 2
+  %ab2 = ashr i32 %a2, 2
+  %ab3 = ashr i32 %a3, 2
+  %ab4 = shl  i32 %a4, 3
+  %ab5 = shl  i32 %a5, 3
+  %ab6 = shl  i32 %a6, 3
+  %ab7 = shl  i32 %a7, 3
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: @ashr_lshr_shl_v8i32(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SSE-NEXT:    [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
+; SSE-NEXT:    [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
+; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
+; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @ashr_lshr_shl_v8i32(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
+; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SLM-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SLM-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; SLM-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
+; SLM-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; SLM-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; SLM-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SLM-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_lshr_shl_v8i32(
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; AVX1-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
+; AVX1-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; AVX1-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX1-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
+; AVX1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_lshr_shl_v8i32(
+; AVX2-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; AVX2-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX2-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; AVX2-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX2-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
+; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX2-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
+; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
+; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
+; AVX2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
+; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX2-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX2-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX2-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_lshr_shl_v8i32(
+; AVX512-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; AVX512-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX512-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; AVX512-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX512-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
+; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
+; AVX512-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %b0 = extractelement <8 x i32> %b, i32 0
+  %b1 = extractelement <8 x i32> %b, i32 1
+  %b2 = extractelement <8 x i32> %b, i32 2
+  %b3 = extractelement <8 x i32> %b, i32 3
+  %b4 = extractelement <8 x i32> %b, i32 4
+  %b5 = extractelement <8 x i32> %b, i32 5
+  %b6 = extractelement <8 x i32> %b, i32 6
+  %b7 = extractelement <8 x i32> %b, i32 7
+  %ab0 = ashr i32 %a0, %b0
+  %ab1 = ashr i32 %a1, %b1
+  %ab2 = lshr i32 %a2, %b2
+  %ab3 = lshr i32 %a3, %b3
+  %ab4 = lshr i32 %a4, %b4
+  %ab5 = lshr i32 %a5, %b5
+  %ab6 = shl  i32 %a6, %b6
+  %ab7 = shl  i32 %a7, %b7
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+; Eight scalar i32 adds with constant addends (lanes 0 and 4 add `undef`),
+; rebuilt into a vector via an insertelement chain. The CHECK lines expect
+; the whole pattern to fold to one <8 x i32> vector add whose constant
+; operand keeps `undef` in the corresponding lanes — i.e. the undef addends
+; do not block vectorization here.
+define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) {
+; CHECK-LABEL: @add_v8i32_undefs(
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], <i32 undef, i32 4, i32 8, i32 16, i32 undef, i32 4, i32 8, i32 16>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = add i32 %a0, undef
+  %ab1 = add i32 %a1, 4
+  %ab2 = add i32 %a2, 8
+  %ab3 = add i32 %a3, 16
+  %ab4 = add i32 %a4, undef
+  %ab5 = add i32 %a5, 4
+  %ab6 = add i32 %a6, 8
+  %ab7 = add i32 %a7, 16
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+; Same insertelement-chain shape as the add test above, but with sdiv and
+; `undef` divisors in lanes 0 and 4. The CHECK lines expect NO vectorization:
+; every surviving lane stays a scalar sdiv, and the undef-divisor lanes
+; (ab0/ab4) plus their inserts are deleted outright — presumably because
+; division by undef is undefined behavior and sdiv cannot be speculated
+; across lanes (NOTE(review): rationale inferred; the checked output only
+; shows the scalars remaining and lanes 0/4 dropped).
+define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
+; CHECK-LABEL: @sdiv_v8i32_undefs(
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
+; CHECK-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; CHECK-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; CHECK-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; CHECK-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; CHECK-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; CHECK-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
+; CHECK-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
+; CHECK-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; CHECK-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
+; CHECK-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
+; CHECK-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
+; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1
+; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = sdiv i32 %a0, undef
+  %ab1 = sdiv i32 %a1, 4
+  %ab2 = sdiv i32 %a2, 8
+  %ab3 = sdiv i32 %a3, 16
+  %ab4 = sdiv i32 %a4, undef
+  %ab5 = sdiv i32 %a5, 4
+  %ab6 = sdiv i32 %a6, 8
+  %ab7 = sdiv i32 %a7, 16
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}
+
+; Alternating add/sub against a single scalar %b: lanes 0-3 add %b (with the
+; operand order deliberately commuted between lanes — a0+b, b+a1, a2+b, b+a3)
+; and lanes 4-7 compute b - aN. The CHECK lines expect %b to be splatted
+; (insertelement + zeroinitializer shuffle), one <8 x i32> add and one
+; <8 x i32> sub over the splat, and a blend shufflevector taking lanes 0-3
+; from the add and lanes 4-7 (indices 12-15) from the sub.
+define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
+; CHECK-LABEL: @add_sub_v8i32_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[R7]]
+;
+  %a0 = extractelement <8 x i32> %a, i32 0
+  %a1 = extractelement <8 x i32> %a, i32 1
+  %a2 = extractelement <8 x i32> %a, i32 2
+  %a3 = extractelement <8 x i32> %a, i32 3
+  %a4 = extractelement <8 x i32> %a, i32 4
+  %a5 = extractelement <8 x i32> %a, i32 5
+  %a6 = extractelement <8 x i32> %a, i32 6
+  %a7 = extractelement <8 x i32> %a, i32 7
+  %ab0 = add i32 %a0, %b
+  %ab1 = add i32 %b, %a1
+  %ab2 = add i32 %a2, %b
+  %ab3 = add i32 %b, %a3
+  %ab4 = sub i32 %b, %a4
+  %ab5 = sub i32 %b, %a5
+  %ab6 = sub i32 %b, %a6
+  %ab7 = sub i32 %b, %a7
+  %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+  %r1 = insertelement <8 x i32>   %r0, i32 %ab1, i32 1
+  %r2 = insertelement <8 x i32>   %r1, i32 %ab2, i32 2
+  %r3 = insertelement <8 x i32>   %r2, i32 %ab3, i32 3
+  %r4 = insertelement <8 x i32>   %r3, i32 %ab4, i32 4
+  %r5 = insertelement <8 x i32>   %r4, i32 %ab5, i32 5
+  %r6 = insertelement <8 x i32>   %r5, i32 %ab6, i32 6
+  %r7 = insertelement <8 x i32>   %r6, i32 %ab7, i32 7
+  ret <8 x i32> %r7
+}




More information about the llvm-commits mailing list