[llvm] r358552 - Revert "Temporarily Revert "Add basic loop fusion pass.""

Eric Christopher via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 16 21:53:01 PDT 2019


Added: llvm/trunk/test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2005-03-04-ShiftOverflow.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | \
+; RUN:   not grep "ret i1 false"
+
+define i1 @test(i64 %tmp.169) {
+        %tmp.1710 = lshr i64 %tmp.169, 1                ; <i64> [#uses=1]
+        %tmp.1912 = icmp ugt i64 %tmp.1710, 0           ; <i1> [#uses=1]
+        ret i1 %tmp.1912
+}
+
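For reference: %tmp.1710 is %tmp.169 shifted right by one, so it is nonzero exactly when %tmp.169 is unsigned-greater-than 1. A correct fold of the compare would be roughly

        %tmp.1912 = icmp ugt i64 %tmp.169, 1    ; sketch, not taken from the patch

while folding the function to "ret i1 false" (the shift-overflow bug this test guards against) would be wrong for any input >= 2.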

Added: llvm/trunk/test/Transforms/InstCombine/2005-04-07-UDivSelectCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2005-04-07-UDivSelectCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2005-04-07-UDivSelectCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2005-04-07-UDivSelectCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -disable-output
+
+define i32 @test(i1 %C, i32 %tmp.15) {
+        %tmp.16 = select i1 %C, i32 8, i32 1            ; <i32> [#uses=1]
+        %tmp.18 = udiv i32 %tmp.15, %tmp.16             ; <i32> [#uses=1]
+        ret i32 %tmp.18
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2005-06-15-DivSelectCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2005-06-15-DivSelectCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2005-06-15-DivSelectCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2005-06-15-DivSelectCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -disable-output
+
+define i32 @_Z13func_31585107li(i32 %l_39521025, i32 %l_59244666) {
+        %shortcirc_val = select i1 false, i32 1, i32 0          ; <i32> [#uses=1]
+        %tmp.8 = udiv i32 0, %shortcirc_val             ; <i32> [#uses=1]
+        %tmp.9 = icmp eq i32 %tmp.8, 0          ; <i1> [#uses=1]
+        %retval = select i1 %tmp.9, i32 %l_59244666, i32 -1621308501            ; <i32> [#uses=1]
+        ret i32 %retval
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2005-06-15-ShiftSetCCCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2005-06-15-ShiftSetCCCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2005-06-15-ShiftSetCCCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2005-06-15-ShiftSetCCCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -disable-output
+; PR577
+
+define i1 @test() {
+        %tmp.3 = shl i32 0, 41          ; <i32> [#uses=1]
+        %tmp.4 = icmp ne i32 %tmp.3, 0          ; <i1> [#uses=1]
+        ret i1 %tmp.4
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2005-06-16-RangeCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2005-06-16-RangeCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2005-06-16-RangeCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2005-06-16-RangeCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -disable-output
+; PR585
+
+define i1 @test() {
+        %tmp.26 = sdiv i32 0, -2147483648               ; <i32> [#uses=1]
+        %tmp.27 = icmp eq i32 %tmp.26, 0                ; <i1> [#uses=1]
+        ret i1 %tmp.27
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2005-07-07-DeadPHILoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2005-07-07-DeadPHILoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2005-07-07-DeadPHILoop.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2005-07-07-DeadPHILoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt < %s -instcombine -disable-output
+
+; This example caused instcombine to spin into an infinite loop.
+
+define void @test(i32* %P) {
+        ret void
+
+Dead:           ; preds = %Dead
+        %X = phi i32 [ %Y, %Dead ]              ; <i32> [#uses=1]
+        %Y = sdiv i32 %X, 10            ; <i32> [#uses=2]
+        store i32 %Y, i32* %P
+        br label %Dead
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2006-02-13-DemandedMiscompile.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-02-13-DemandedMiscompile.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-02-13-DemandedMiscompile.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-02-13-DemandedMiscompile.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | \
+; RUN:   not grep undef
+
+define i32 @test(i8 %A) {
+        %B = sext i8 %A to i32          ; <i32> [#uses=1]
+        %C = ashr i32 %B, 8             ; <i32> [#uses=1]
+        ret i32 %C
+}
+
+
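For reference: after the sext, bits 8..31 of %B are all copies of %A's sign bit, so %C is either 0 or -1 (equivalently, ashr i32 %B, 31). The demanded-bits bug this test guards against folded the result to undef; the "not grep undef" checks that no undef survives.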

Added: llvm/trunk/test/Transforms/InstCombine/2006-02-28-Crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-02-28-Crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-02-28-Crash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-02-28-Crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -disable-output
+
+define i32 @test() {
+        %tmp203 = icmp eq i32 1, 2              ; <i1> [#uses=1]
+        %tmp203.upgrd.1 = zext i1 %tmp203 to i32                ; <i32> [#uses=1]
+        ret i32 %tmp203.upgrd.1
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2006-03-30-ExtractElement.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-03-30-ExtractElement.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-03-30-ExtractElement.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-03-30-ExtractElement.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -disable-output
+
+define float @test(<4 x float> %V) {
+        %V2 = insertelement <4 x float> %V, float 1.000000e+00, i32 3           ; <<4 x float>> [#uses=1]
+        %R = extractelement <4 x float> %V2, i32 2              ; <float> [#uses=1]
+        ret float %R
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2006-04-28-ShiftShiftLongLong.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-04-28-ShiftShiftLongLong.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-04-28-ShiftShiftLongLong.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-04-28-ShiftShiftLongLong.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; This cannot be turned into a sign extending cast!
+
+define i64 @test(i64 %X) {
+        %Y = shl i64 %X, 16             ; <i64> [#uses=1]
+; CHECK: %Y = shl i64 %X, 16
+        %Z = ashr i64 %Y, 16            ; <i64> [#uses=1]
+; CHECK: %Z = ashr exact i64 %Y, 16
+        ret i64 %Z
+; CHECK: ret i64 %Z
+}
+
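For reference: bit-for-bit, the shl/ashr pair equals sign-extension of the low 48 bits of %X, i.e. roughly

        %t = trunc i64 %X to i48        ; illustrative equivalent, %t is not in the patch
        %Z = sext i48 %t to i64

Presumably the odd i48 width is why instcombine keeps the shift form, as the CHECK lines show, only marking the ashr exact: the bits it shifts out are the zeros the shl shifted in.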

Added: llvm/trunk/test/Transforms/InstCombine/2006-05-04-DemandedBitCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-05-04-DemandedBitCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-05-04-DemandedBitCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-05-04-DemandedBitCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,51 @@
+; RUN: opt < %s -instcombine -disable-output
+; END.
+
+define void @test() {
+bb38.i:
+	%varspec.0.i1014 = bitcast i64 123814269237067777 to i64		; <i64> [#uses=1]
+	%locspec.0.i1015 = bitcast i32 1 to i32		; <i32> [#uses=2]
+	%tmp51391.i1018 = lshr i64 %varspec.0.i1014, 16		; <i64> [#uses=1]
+	%tmp51392.i1019 = trunc i64 %tmp51391.i1018 to i32		; <i32> [#uses=2]
+	%tmp51392.mask.i1020 = lshr i32 %tmp51392.i1019, 29		; <i32> [#uses=1]
+	%tmp7.i1021 = and i32 %tmp51392.mask.i1020, 1		; <i32> [#uses=2]
+	%tmp18.i1026 = lshr i32 %tmp51392.i1019, 31		; <i32> [#uses=2]
+	%tmp18.i1027 = trunc i32 %tmp18.i1026 to i8		; <i8> [#uses=1]
+	br i1 false, label %cond_false1148.i1653, label %bb377.i1259
+
+bb377.i1259:		; preds = %bb38.i
+	br i1 false, label %cond_true541.i1317, label %cond_false1148.i1653
+
+cond_true541.i1317:		; preds = %bb377.i1259
+	%tmp545.i1318 = lshr i32 %locspec.0.i1015, 10		; <i32> [#uses=1]
+	%tmp550.i1319 = lshr i32 %locspec.0.i1015, 4		; <i32> [#uses=1]
+	%tmp550551.i1320 = and i32 %tmp550.i1319, 63		; <i32> [#uses=1]
+	%tmp553.i1321 = icmp ult i32 %tmp550551.i1320, 4		; <i1> [#uses=1]
+	%tmp558.i1322 = icmp eq i32 %tmp7.i1021, 0		; <i1> [#uses=1]
+	%bothcond.i1326 = or i1 %tmp553.i1321, false		; <i1> [#uses=1]
+	%bothcond1.i1327 = or i1 %bothcond.i1326, false		; <i1> [#uses=1]
+	%bothcond2.not.i1328 = or i1 %bothcond1.i1327, false		; <i1> [#uses=1]
+	%bothcond3.i1329 = or i1 %bothcond2.not.i1328, %tmp558.i1322		; <i1> [#uses=0]
+	br i1 false, label %cond_true583.i1333, label %cond_next592.i1337
+
+cond_true583.i1333:		; preds = %cond_true541.i1317
+	br i1 false, label %cond_true586.i1335, label %cond_next592.i1337
+
+cond_true586.i1335:		; preds = %cond_true583.i1333
+	br label %cond_true.i
+
+cond_next592.i1337:		; preds = %cond_true583.i1333, %cond_true541.i1317
+	%mask_z.0.i1339 = phi i32 [ %tmp18.i1026, %cond_true541.i1317 ], [ 0, %cond_true583.i1333 ]		; <i32> [#uses=0]
+	%tmp594.i1340 = and i32 %tmp545.i1318, 15		; <i32> [#uses=0]
+	br label %cond_true.i
+
+cond_false1148.i1653:		; preds = %bb377.i1259, %bb38.i
+	%tmp1150.i1654 = icmp eq i32 %tmp7.i1021, 0		; <i1> [#uses=1]
+	%tmp1160.i1656 = icmp eq i8 %tmp18.i1027, 0		; <i1> [#uses=1]
+	%bothcond8.i1658 = or i1 %tmp1150.i1654, %tmp1160.i1656		; <i1> [#uses=1]
+	%bothcond9.i1659 = or i1 %bothcond8.i1658, false		; <i1> [#uses=0]
+	br label %cond_true.i
+
+cond_true.i:		; preds = %cond_false1148.i1653, %cond_next592.i1337, %cond_true586.i1335
+	ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2006-09-15-CastToBool.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-09-15-CastToBool.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-09-15-CastToBool.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-09-15-CastToBool.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt < %s -instcombine -S | grep and
+; PR913
+
+define i32 @test(i32* %tmp1) {
+        %tmp.i = load i32, i32* %tmp1                ; <i32> [#uses=1]
+        %tmp = bitcast i32 %tmp.i to i32                ; <i32> [#uses=1]
+        %tmp2.ui = lshr i32 %tmp, 5             ; <i32> [#uses=1]
+        %tmp2 = bitcast i32 %tmp2.ui to i32             ; <i32> [#uses=1]
+        %tmp3 = and i32 %tmp2, 1                ; <i32> [#uses=1]
+        %tmp3.upgrd.1 = icmp ne i32 %tmp3, 0            ; <i1> [#uses=1]
+        %tmp34 = zext i1 %tmp3.upgrd.1 to i32           ; <i32> [#uses=1]
+        ret i32 %tmp34
+}
+
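For reference: testing bit 0 of (x lshr 5) is the same as testing bit 5 of x, so the expected shape after instcombine is roughly (value names invented here)

        %m = and i32 %tmp.i, 32
        %b = icmp ne i32 %m, 0
        %tmp34 = zext i1 %b to i32

The grep only pins down that an and instruction survives.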

Added: llvm/trunk/test/Transforms/InstCombine/2006-10-19-SignedToUnsignedCastAndConst-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-10-19-SignedToUnsignedCastAndConst-2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-10-19-SignedToUnsignedCastAndConst-2.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-10-19-SignedToUnsignedCastAndConst-2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; The optimizer should be able to remove cast operation here.
+; RUN: opt < %s -instcombine -S | \
+; RUN:    not grep sext.*i32
+
+define i1 @eq_signed_to_small_unsigned(i8 %SB) {
+        %Y = sext i8 %SB to i32         ; <i32> [#uses=1]
+        %C = icmp eq i32 %Y, 17         ; <i1> [#uses=1]
+        ret i1 %C
+}
+
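For reference: 17 is representable in i8, so the equality compare can be done directly in the narrow type and the sext dropped, roughly

        %C = icmp eq i8 %SB, 17         ; sketch of the expected fold

which is what the "not grep sext.*i32" verifies.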

Added: llvm/trunk/test/Transforms/InstCombine/2006-10-20-mask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-10-20-mask.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-10-20-mask.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-10-20-mask.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: and
+
+define i64 @foo(i64 %tmp, i64 %tmp2) {
+        %tmp.upgrd.1 = trunc i64 %tmp to i32            ; <i32> [#uses=1]
+        %tmp2.upgrd.2 = trunc i64 %tmp2 to i32          ; <i32> [#uses=1]
+        %tmp3 = and i32 %tmp.upgrd.1, %tmp2.upgrd.2             ; <i32> [#uses=1]
+        %tmp4 = zext i32 %tmp3 to i64           ; <i64> [#uses=1]
+        ret i64 %tmp4
+}
+
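For reference: zext(trunc(x) & trunc(y)) can be carried out entirely in i64; an equivalent form (with an invented name %a) is

        %a = and i64 %tmp, %tmp2
        %tmp4 = and i64 %a, 4294967295  ; mask stands in for the trunc/zext pair

The CHECK only requires that an and remains.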

Added: llvm/trunk/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,145 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; (V * C1) * C2 => V * (C1 * C2)
+; Verify this doesn't fold when no fast-math-flags are specified
+define <4 x float> @test_fmul(<4 x float> %V) {
+; CHECK-LABEL: @test_fmul(
+; CHECK-NEXT:     [[TMP1:%.*]] = fmul <4 x float> [[V:%.*]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     [[TMP2:%.*]] = fmul <4 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     ret <4 x float> [[TMP2]]
+        %Y = fmul <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Z = fmul <4 x float> %Y, < float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00 >
+        ret <4 x float> %Z
+}
+
+; (V * C1) * C2 => V * (C1 * C2)
+; Verify this folds with 'fast'
+define <4 x float> @test_fmul_fast(<4 x float> %V) {
+; CHECK-LABEL: @test_fmul_fast(
+; CHECK-NEXT:     [[TMP1:%.*]] = fmul fast <4 x float> [[V:%.*]], <float 1.000000e+00, float 4.000000e+05, float -9.000000e+00, float 1.600000e+01>
+; CHECK-NEXT:     ret <4 x float> [[TMP1]]
+        %Y = fmul fast <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Z = fmul fast <4 x float> %Y, < float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00 >
+        ret <4 x float> %Z
+}
+
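For reference, the folded constant in the CHECK line above is just the elementwise product of the two multiplicands: 1*1 = 1, 2*2e5 = 4e5, 3*(-3) = -9, 4*4 = 16.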
+; (V * C1) * C2 => V * (C1 * C2)
+; Verify this folds with 'reassoc' and 'nsz' ('nsz' not technically required)
+define <4 x float> @test_fmul_reassoc_nsz(<4 x float> %V) {
+; CHECK-LABEL: @test_fmul_reassoc_nsz(
+; CHECK-NEXT:     [[TMP1:%.*]] = fmul reassoc nsz <4 x float> [[V:%.*]], <float 1.000000e+00, float 4.000000e+05, float -9.000000e+00, float 1.600000e+01>
+; CHECK-NEXT:     ret <4 x float> [[TMP1]]
+        %Y = fmul reassoc nsz <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Z = fmul reassoc nsz <4 x float> %Y, < float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00 >
+        ret <4 x float> %Z
+}
+
+; (V * C1) * C2 => V * (C1 * C2)
+; TODO: This doesn't require 'nsz'.  It should fold to V * { 1.0, 4.0e+05, -9.0, 16.0 }
+define <4 x float> @test_fmul_reassoc(<4 x float> %V) {
+; CHECK-LABEL: @test_fmul_reassoc(
+; CHECK-NEXT:     [[TMP1:%.*]] = fmul reassoc <4 x float> [[V:%.*]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     [[TMP2:%.*]] = fmul reassoc <4 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     ret <4 x float> [[TMP2]]
+        %Y = fmul reassoc <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Z = fmul reassoc <4 x float> %Y, < float 1.000000e+00, float 2.000000e+05, float -3.000000e+00, float 4.000000e+00 >
+        ret <4 x float> %Z
+}
+
+; (V + C1) + C2 => V + (C1 + C2)
+; Verify this doesn't fold when no fast-math-flags are specified
+define <4 x float> @test_fadd(<4 x float> %V) {
+; CHECK-LABEL: @test_fadd(
+; CHECK-NEXT:     [[TMP1:%.*]] = fadd <4 x float> [[V:%.*]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float -3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     ret <4 x float> [[TMP2]]
+        %Y = fadd <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Z = fadd <4 x float> %Y, < float 1.000000e+00, float 2.000000e+00, float -3.000000e+00, float 4.000000e+00 >
+        ret <4 x float> %Z
+}
+
+; (V + C1) + C2 => V + (C1 + C2)
+; Verify this folds with 'fast'
+define <4 x float> @test_fadd_fast(<4 x float> %V) {
+; CHECK-LABEL: @test_fadd_fast(
+; CHECK-NEXT:     [[TMP1:%.*]] = fadd fast <4 x float> [[V:%.*]], <float 2.000000e+00, float 4.000000e+00, float 0.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:     ret <4 x float> [[TMP1]]
+        %Y = fadd fast <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Z = fadd fast <4 x float> %Y, < float 1.000000e+00, float 2.000000e+00, float -3.000000e+00, float 4.000000e+00 >
+        ret <4 x float> %Z
+}
+
+; (V + C1) + C2 => V + (C1 + C2)
+; Verify this folds with 'reassoc' and 'nsz' ('nsz' not technically required)
+define <4 x float> @test_fadd_reassoc_nsz(<4 x float> %V) {
+; CHECK-LABEL: @test_fadd_reassoc_nsz(
+; CHECK-NEXT:     [[TMP1:%.*]] = fadd reassoc nsz <4 x float> [[V:%.*]], <float 2.000000e+00, float 4.000000e+00, float 0.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:     ret <4 x float> [[TMP1]]
+        %Y = fadd reassoc nsz <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Z = fadd reassoc nsz <4 x float> %Y, < float 1.000000e+00, float 2.000000e+00, float -3.000000e+00, float 4.000000e+00 >
+        ret <4 x float> %Z
+}
+
+; (V + C1) + C2 => V + (C1 + C2)
+; TODO: This doesn't require 'nsz'.  It should fold to V + { 2.0, 4.0, 0.0, 8.0 }
+define <4 x float> @test_fadd_reassoc(<4 x float> %V) {
+; CHECK-LABEL: @test_fadd_reassoc(
+; CHECK-NEXT:     [[TMP1:%.*]] = fadd reassoc <4 x float> [[V:%.*]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     [[TMP2:%.*]] = fadd reassoc <4 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float -3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     ret <4 x float> [[TMP2]]
+        %Y = fadd reassoc <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Z = fadd reassoc <4 x float> %Y, < float 1.000000e+00, float 2.000000e+00, float -3.000000e+00, float 4.000000e+00 >
+        ret <4 x float> %Z
+}
+
+; ( A + C1 ) + ( B + -C1 )
+; Verify this doesn't fold when no fast-math-flags are specified
+define <4 x float> @test_fadds_cancel_(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_fadds_cancel_(
+; CHECK-NEXT:     [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     [[TMP2:%.*]] = fadd <4 x float> [[B:%.*]], <float -1.000000e+00, float -2.000000e+00, float -3.000000e+00, float -4.000000e+00>
+; CHECK-NEXT:     [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:     ret <4 x float> [[TMP3]]
+        %X = fadd <4 x float> %A, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Y = fadd <4 x float> %B, < float -1.000000e+00, float -2.000000e+00, float -3.000000e+00, float -4.000000e+00 >
+        %Z = fadd <4 x float> %X, %Y
+        ret <4 x float> %Z
+}
+
+; ( A + C1 ) + ( B + -C1 )
+; Verify this folds to 'A + B' with 'fast'
+define <4 x float> @test_fadds_cancel_fast(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_fadds_cancel_fast(
+; CHECK-NEXT:     [[TMP1:%.*]] = fadd fast <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:     ret <4 x float> [[TMP1]]
+        %X = fadd fast <4 x float> %A, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Y = fadd fast <4 x float> %B, < float -1.000000e+00, float -2.000000e+00, float -3.000000e+00, float -4.000000e+00 >
+        %Z = fadd fast <4 x float> %X, %Y
+        ret <4 x float> %Z
+}
+
+; ( A + C1 ) + ( B + -C1 )
+; Verify this folds to 'A + B' with 'reassoc' and 'nsz' ('nsz' is required)
+define <4 x float> @test_fadds_cancel_reassoc_nsz(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_fadds_cancel_reassoc_nsz(
+; CHECK-NEXT:     [[TMP1:%.*]] = fadd reassoc nsz <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:     ret <4 x float> [[TMP1]]
+        %X = fadd reassoc nsz <4 x float> %A, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Y = fadd reassoc nsz <4 x float> %B, < float -1.000000e+00, float -2.000000e+00, float -3.000000e+00, float -4.000000e+00 >
+        %Z = fadd reassoc nsz <4 x float> %X, %Y
+        ret <4 x float> %Z
+}
+
+; ( A + C1 ) + ( B + -C1 )
+; Verify the fold is not done with only 'reassoc' ('nsz' is required).
+define <4 x float> @test_fadds_cancel_reassoc(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @test_fadds_cancel_reassoc(
+; CHECK-NEXT:     [[TMP1:%.*]] = fadd reassoc <4 x float> [[A:%.*]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>
+; CHECK-NEXT:     [[TMP2:%.*]] = fadd reassoc <4 x float> [[B:%.*]], <float -1.000000e+00, float -2.000000e+00, float -3.000000e+00, float -4.000000e+00>
+; CHECK-NEXT:     [[TMP3:%.*]] = fadd reassoc <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:     ret <4 x float> [[TMP3]]
+        %X = fadd reassoc <4 x float> %A, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >
+        %Y = fadd reassoc <4 x float> %B, < float -1.000000e+00, float -2.000000e+00, float -3.000000e+00, float -4.000000e+00 >
+        %Z = fadd reassoc <4 x float> %X, %Y
+        ret <4 x float> %Z
+}
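For reference, why nsz is required for the cancellation: take a lane with A = -0.0, B = -0.0, C1 = 1.0. Then (A + C1) + (B + (-C1)) = 1.0 + (-1.0) = +0.0, but A + B = -0.0, so folding under plain reassoc would flip the sign of zero.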

Added: llvm/trunk/test/Transforms/InstCombine/2006-11-10-ashr-miscompile.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-11-10-ashr-miscompile.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-11-10-ashr-miscompile.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-11-10-ashr-miscompile.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | grep lshr
+; Verify this is not turned into -1.
+
+define i32 @test(i8 %amt) {
+        %shift.upgrd.1 = zext i8 %amt to i32            ; <i32> [#uses=1]
+        %B = lshr i32 -1, %shift.upgrd.1                ; <i32> [#uses=1]
+        ret i32 %B
+}
+
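For reference: only an arithmetic shift of -1 yields -1 for every shift amount. lshr shifts in zeros, e.g. lshr i32 -1, 8 gives 0x00FFFFFF, so the fold to -1 that this test guards against is only correct when %amt is 0.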

Added: llvm/trunk/test/Transforms/InstCombine/2006-12-01-BadFPVectorXform.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-12-01-BadFPVectorXform.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-12-01-BadFPVectorXform.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-12-01-BadFPVectorXform.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define <4 x float> @test(<4 x float> %tmp26, <4 x float> %tmp53) {
+        ; (X+Y)-Y != X for fp vectors.
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TMP64:%.*]] = fadd <4 x float> %tmp26, %tmp53
+; CHECK-NEXT:    [[TMP75:%.*]] = fsub <4 x float> [[TMP64]], %tmp53
+; CHECK-NEXT:    ret <4 x float> [[TMP75]]
+;
+  %tmp64 = fadd <4 x float> %tmp26, %tmp53
+  %tmp75 = fsub <4 x float> %tmp64, %tmp53
+  ret <4 x float> %tmp75
+}
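For reference, a concrete counterexample to (X+Y)-Y == X under rounding: in a float lane with X = 1.0 and Y = 1.0e8, X+Y rounds to 1.0e8, and subtracting Y back gives 0.0, not 1.0. Hence the fadd/fsub pair must survive when no fast-math flags are present.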

Added: llvm/trunk/test/Transforms/InstCombine/2006-12-05-fp-to-int-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-12-05-fp-to-int-ext.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-12-05-fp-to-int-ext.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-12-05-fp-to-int-ext.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine -S | grep zext
+
+; Never merge these two conversions, even though it's possible: this is
+; significantly more expensive than the two conversions on some targets
+; and it causes libgcc to compile __fixunsdfdi into a recursive
+; function.
+define i64 @test(double %D) {
+        %A = fptoui double %D to i32            ; <i32> [#uses=1]
+        %B = zext i32 %A to i64         ; <i64> [#uses=1]
+        ret i64 %B
+}
+
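For reference, the forbidden merge would be simply

        %B = fptoui double %D to i64    ; the single conversion the comment rules out

On targets without a native unsigned double-to-i64 conversion this becomes a call to __fixunsdfdi, and since libgcc's __fixunsdfdi itself compiles down to the fptoui-plus-zext pattern above, doing the merge while building libgcc turns the helper into a call to itself.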

Added: llvm/trunk/test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-12-08-Phi-ICmp-Op-Fold.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,51 @@
+; RUN: opt < %s -instcombine -S | \
+; RUN:   grep "icmp sgt"
+; END.
+target datalayout = "e-p:32:32"
+target triple = "i686-pc-linux-gnu"
+	%struct.point = type { i32, i32 }
+
+define i32 @visible(i32 %direction, i64 %p1.0, i64 %p2.0, i64 %p3.0) {
+entry:
+	%p1_addr = alloca %struct.point		; <%struct.point*> [#uses=2]
+	%p2_addr = alloca %struct.point		; <%struct.point*> [#uses=2]
+	%p3_addr = alloca %struct.point		; <%struct.point*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%tmp = bitcast %struct.point* %p1_addr to { i64 }*		; <{ i64 }*> [#uses=1]
+	%tmp.upgrd.1 = getelementptr { i64 }, { i64 }* %tmp, i64 0, i32 0		; <i64*> [#uses=1]
+	store i64 %p1.0, i64* %tmp.upgrd.1
+	%tmp1 = bitcast %struct.point* %p2_addr to { i64 }*		; <{ i64 }*> [#uses=1]
+	%tmp2 = getelementptr { i64 }, { i64 }* %tmp1, i64 0, i32 0		; <i64*> [#uses=1]
+	store i64 %p2.0, i64* %tmp2
+	%tmp3 = bitcast %struct.point* %p3_addr to { i64 }*		; <{ i64 }*> [#uses=1]
+	%tmp4 = getelementptr { i64 }, { i64 }* %tmp3, i64 0, i32 0		; <i64*> [#uses=1]
+	store i64 %p3.0, i64* %tmp4
+	%tmp.upgrd.2 = icmp eq i32 %direction, 0		; <i1> [#uses=1]
+	%tmp5 = bitcast %struct.point* %p1_addr to { i64 }*		; <{ i64 }*> [#uses=1]
+	%tmp6 = getelementptr { i64 }, { i64 }* %tmp5, i64 0, i32 0		; <i64*> [#uses=1]
+	%tmp.upgrd.3 = load i64, i64* %tmp6		; <i64> [#uses=1]
+	%tmp7 = bitcast %struct.point* %p2_addr to { i64 }*		; <{ i64 }*> [#uses=1]
+	%tmp8 = getelementptr { i64 }, { i64 }* %tmp7, i64 0, i32 0		; <i64*> [#uses=1]
+	%tmp9 = load i64, i64* %tmp8		; <i64> [#uses=1]
+	%tmp10 = bitcast %struct.point* %p3_addr to { i64 }*		; <{ i64 }*> [#uses=1]
+	%tmp11 = getelementptr { i64 }, { i64 }* %tmp10, i64 0, i32 0		; <i64*> [#uses=1]
+	%tmp12 = load i64, i64* %tmp11		; <i64> [#uses=1]
+	%tmp13 = call i32 @determinant( i64 %tmp.upgrd.3, i64 %tmp9, i64 %tmp12 )		; <i32> [#uses=2]
+	br i1 %tmp.upgrd.2, label %cond_true, label %cond_false
+
+cond_true:		; preds = %entry
+	%tmp14 = icmp slt i32 %tmp13, 0		; <i1> [#uses=1]
+	%tmp14.upgrd.4 = zext i1 %tmp14 to i32		; <i32> [#uses=1]
+	br label %return
+
+cond_false:		; preds = %entry
+	%tmp26 = icmp sgt i32 %tmp13, 0		; <i1> [#uses=1]
+	%tmp26.upgrd.5 = zext i1 %tmp26 to i32		; <i32> [#uses=1]
+	br label %return
+
+return:		; preds = %cond_false, %cond_true
+	%retval.0 = phi i32 [ %tmp14.upgrd.4, %cond_true ], [ %tmp26.upgrd.5, %cond_false ]		; <i32> [#uses=1]
+	ret i32 %retval.0
+}
+
+declare i32 @determinant(i64, i64, i64)

Added: llvm/trunk/test/Transforms/InstCombine/2006-12-08-Select-ICmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-12-08-Select-ICmp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-12-08-Select-ICmp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-12-08-Select-ICmp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,42 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: select
+; END.
+
+target datalayout = "e-p:32:32"
+target triple = "i686-pc-linux-gnu"
+        %struct.point = type { i32, i32 }
+
+define i32 @visible(i32 %direction, i64 %p1.0, i64 %p2.0, i64 %p3.0) {
+entry:
+        %p1_addr = alloca %struct.point         ; <%struct.point*> [#uses=2]
+        %p2_addr = alloca %struct.point         ; <%struct.point*> [#uses=2]
+        %p3_addr = alloca %struct.point         ; <%struct.point*> [#uses=2]
+        %tmp = bitcast %struct.point* %p1_addr to { i64 }*              ; <{ i64 }*> [#uses=1]
+        %tmp.upgrd.1 = getelementptr { i64 }, { i64 }* %tmp, i32 0, i32 0                ; <i64*> [#uses=1]
+        store i64 %p1.0, i64* %tmp.upgrd.1
+        %tmp1 = bitcast %struct.point* %p2_addr to { i64 }*             ; <{ i64 }*> [#uses=1]
+        %tmp2 = getelementptr { i64 }, { i64 }* %tmp1, i32 0, i32 0              ; <i64*> [#uses=1]
+        store i64 %p2.0, i64* %tmp2
+        %tmp3 = bitcast %struct.point* %p3_addr to { i64 }*             ; <{ i64 }*> [#uses=1]
+        %tmp4 = getelementptr { i64 }, { i64 }* %tmp3, i32 0, i32 0              ; <i64*> [#uses=1]
+        store i64 %p3.0, i64* %tmp4
+        %tmp.upgrd.2 = icmp eq i32 %direction, 0                ; <i1> [#uses=1]
+        %tmp5 = bitcast %struct.point* %p1_addr to { i64 }*             ; <{ i64 }*> [#uses=1]
+        %tmp6 = getelementptr { i64 }, { i64 }* %tmp5, i32 0, i32 0              ; <i64*> [#uses=1]
+        %tmp.upgrd.3 = load i64, i64* %tmp6          ; <i64> [#uses=1]
+        %tmp7 = bitcast %struct.point* %p2_addr to { i64 }*             ; <{ i64 }*> [#uses=1]
+        %tmp8 = getelementptr { i64 }, { i64 }* %tmp7, i32 0, i32 0              ; <i64*> [#uses=1]
+        %tmp9 = load i64, i64* %tmp8         ; <i64> [#uses=1]
+        %tmp10 = bitcast %struct.point* %p3_addr to { i64 }*            ; <{ i64 }*> [#uses=1]
+        %tmp11 = getelementptr { i64 }, { i64 }* %tmp10, i32 0, i32 0            ; <i64*> [#uses=1]
+        %tmp12 = load i64, i64* %tmp11               ; <i64> [#uses=1]
+        %tmp13 = call i32 @determinant( i64 %tmp.upgrd.3, i64 %tmp9, i64 %tmp12 )         ; <i32> [#uses=2]
+        %tmp14 = icmp slt i32 %tmp13, 0         ; <i1> [#uses=1]
+        %tmp26 = icmp sgt i32 %tmp13, 0         ; <i1> [#uses=1]
+        %retval.0.in = select i1 %tmp.upgrd.2, i1 %tmp14, i1 %tmp26             ; <i1> [#uses=1]
+        %retval.0 = zext i1 %retval.0.in to i32         ; <i32> [#uses=1]
+        ret i32 %retval.0
+}
+
+declare i32 @determinant(i64, i64, i64)
+

Added: llvm/trunk/test/Transforms/InstCombine/2006-12-15-Range-Test.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-12-15-Range-Test.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-12-15-Range-Test.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-12-15-Range-Test.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; RUN: opt < %s -instcombine -S | \
+; RUN:   grep icmp | count 1
+; RUN: opt < %s -instcombine -S | \
+; RUN:   grep "icmp ugt" | count 1
+; END.
+
+target datalayout = "e-p:32:32"
+target triple = "i686-pc-linux-gnu"
+@r = external global [17 x i32]         ; <[17 x i32]*> [#uses=1]
+
+define i1 @print_pgm_cond_true(i32 %tmp12.reload, i32* %tmp16.out) {
+newFuncRoot:
+        br label %cond_true
+
+bb27.exitStub:          ; preds = %cond_true
+        store i32 %tmp16, i32* %tmp16.out
+        ret i1 true
+
+cond_next23.exitStub:           ; preds = %cond_true
+        store i32 %tmp16, i32* %tmp16.out
+        ret i1 false
+
+cond_true:              ; preds = %newFuncRoot
+        %tmp15 = getelementptr [17 x i32], [17 x i32]* @r, i32 0, i32 %tmp12.reload         ; <i32*> [#uses=1]
+        %tmp16 = load i32, i32* %tmp15               ; <i32> [#uses=4]
+        %tmp18 = icmp slt i32 %tmp16, -31               ; <i1> [#uses=1]
+        %tmp21 = icmp sgt i32 %tmp16, 31                ; <i1> [#uses=1]
+        %bothcond = or i1 %tmp18, %tmp21                ; <i1> [#uses=1]
+        br i1 %bothcond, label %bb27.exitStub, label %cond_next23.exitStub
+}
+
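For reference: the two signed compares form the range test "%tmp16 outside [-31, 31]", and the single compare the RUN lines demand is the usual offset trick, roughly (with an invented name %off)

        %off = add i32 %tmp16, 31
        %bothcond = icmp ugt i32 %off, 62

since %tmp16 lies in [-31, 31] exactly when %tmp16 + 31 lies in [0, 62] unsigned.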

Added: llvm/trunk/test/Transforms/InstCombine/2006-12-23-Select-Cmp-Cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2006-12-23-Select-Cmp-Cmp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2006-12-23-Select-Cmp-Cmp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2006-12-23-Select-Cmp-Cmp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; For PR1065. This causes an assertion in instcombine if a select with two cmp
+; operands is encountered.
+; RUN: opt < %s -instcombine -disable-output
+; END.
+
+target datalayout = "e-p:32:32"
+target triple = "i686-pc-linux-gnu"
+	%struct.internal_state = type { i32 }
+	%struct.mng_data = type { i32, i8*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8, i32, i32, i32, i8, i32, i32, i32, i32, i16, i16, i16, i8, i8, double, double, double, i8, i8, i8, i8, i32, i32, i32, i32, i32, i8, i32, i32, i8*, i8* (i32)*, void (i8*, i32)*, void (i8*, i8*, i32)*, i8 (%struct.mng_data*)*, i8 (%struct.mng_data*)*, i8 (%struct.mng_data*, i8*, i32, i32*)*, i8 (%struct.mng_data*, i8*, i32, i32*)*, i8 (%struct.mng_data*, i32, i8, i32, i32, i32, i32, i8*)*, i8 (%struct.mng_data*, i32, i32, i8*)*, i8 (%struct.mng_data*, i32, i32)*, i8 (%struct.mng_data*, i8, i8*, i8*, i8*, i8*)*, i8 (%struct.mng_data*)*, i8 (%struct.mng_data*, i8*)*, i8 (%struct.mng_data*, i8*)*, i8 (%struct.mng_data*, i32, i32)*, i8 (%struct.mng_data*, i32, i32, i8*)*, i8 (%struct.mng_data*, i8, i8, i32, i32)*, i8* (%struct.mng_data*, i32)*, i8* (%struct.mng_data*, i32)*, i8* (%struct.mng_data*, i32)*, i8 (%struct.mng_data*, i32, i32, i32, i32)*, i32 (%struct.mng_data*)*, i8 (%struct.mng_data*, i32)*, i8 (%struct.mng_data*, i32)*, i8 (%struct.mng_data*, i32, i32, i32, i32, i32, i32, i32, i32)*, i8 (%struct.mng_data*, i8)*, i8 (%struct.mng_data*, i32, i8*)*, i8 (%struct.mng_data*, i32, i8, i8*)*, i8, i32, i32, i8*, i8*, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i32, i8, i8, i8, i8, i8, i32, i8, i8, i8, i32, i8*, i32, i8*, i32, i8, i8, i8, i32, i8*, i8*, i32, i32, i8*, i8*, %struct.mng_pushdata*, %struct.mng_pushdata*, %struct.mng_pushdata*, %struct.mng_pushdata*, i8, i8, i32, i32, i8*, i8, i8, i32, i32, i32, i32, i32, i32, i8, i8, i8, i8, i32, i32, i8*, i32, i32, i32, i8, i8, i32, i32, i32, i32, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i8*, i8*, i8*, i32, i8*, i8*, i8*, i8*, i8*, %struct.mng_savedata*, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8*, i8*, i8*, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8*, i8*, i8*, i8*, i8*, i8*, [256 x i8], double, void ()*, void ()*, void ()*, void ()*, void ()*, void ()*, void ()*, void ()*, void ()*, void ()*, void ()*, void ()*, i16, i8, i8, i8, i8, i8, i32, i32, i8, i32, i32, i32, i32, i16, i16, i16, i8, i16, i8, i32, i32, i32, i32, i8, i32, i32, i8, i32, i32, i32, i32, i8, i32, i32, i8, i32, i32, i32, i32, i32, i8, i32, i8, i16, i16, i16, i16, i32, [256 x %struct.mng_palette8e], i32, [256 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32, i8, i32, i8*, i16, i16, i16, i8*, i8, i8, i32, i32, i32, i32, i8, void ()*, void ()*, void ()*, void ()*, void ()*, void ()*, i8*, i8, i8, i8, i32, i8*, i8*, i16, i16, i16, i16, i32, i32, i8*, %struct.z_stream, i32, i32, i32, i32, i32, i32, i8, i8, [256 x i32], i8 }
+	%struct.mng_palette8e = type { i8, i8, i8 }
+	%struct.mng_pushdata = type { i8*, i8*, i32, i8, i8*, i32 }
+	%struct.mng_savedata = type { i8, i8, i8, i8, i8, i8, i8, i16, i16, i16, i8, i16, i8, i8, i32, i32, i8, i32, i32, i32, i32, i32, [256 x %struct.mng_palette8e], i32, [256 x i8], i32, i32, i32, i32, i32, i32, i32, i32, i32, i8, i32, i8*, i16, i16, i16 }
+	%struct.z_stream = type { i8*, i32, i32, i8*, i32, i32, i8*, %struct.internal_state*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8*, i32, i32, i32 }
+
+define void @mng_write_basi() {
+entry:
+	%tmp = load i8, i8* null		; <i8> [#uses=1]
+	%tmp.upgrd.1 = icmp ugt i8 %tmp, 8		; <i1> [#uses=1]
+	%tmp.upgrd.2 = load i16, i16* null		; <i16> [#uses=2]
+	%tmp3 = icmp eq i16 %tmp.upgrd.2, 255		; <i1> [#uses=1]
+	%tmp7 = icmp eq i16 %tmp.upgrd.2, -1		; <i1> [#uses=1]
+	%bOpaque.0.in = select i1 %tmp.upgrd.1, i1 %tmp7, i1 %tmp3		; <i1> [#uses=1]
+	br i1 %bOpaque.0.in, label %cond_next90, label %bb95
+
+cond_next90:		; preds = %entry
+	ret void
+
+bb95:		; preds = %entry
+	ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-01-13-ExtCompareMiscompile.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | grep "icmp ugt"
+; PR1107
+; PR1940
+
+define i1 @test(i8 %A, i8 %B) {
+	%a = zext i8 %A to i32
+	%b = zext i8 %B to i32
+	%c = icmp sgt i32 %a, %b
+	ret i1 %c
+}
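For reference: both operands are zero-extended and therefore non-negative in i32, so the signed compare is equivalent to an unsigned compare of the original i8 values, roughly

        %c = icmp ugt i8 %A, %B         ; sketch of the expected fold

which is the form the grep requires.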

Added: llvm/trunk/test/Transforms/InstCombine/2007-01-18-VectorInfLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-01-18-VectorInfLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-01-18-VectorInfLoop.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-01-18-VectorInfLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,7 @@
+; RUN: opt < %s -instcombine -disable-output
+
+define <4 x i32> @test(<4 x i32> %A) {
+    %B = xor <4 x i32> %A, < i32 -1, i32 -1, i32 -1, i32 -1 > 
+    %C = and <4 x i32> %B, < i32 -1, i32 -1, i32 -1, i32 -1 >
+    ret <4 x i32> %C
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-02-01-LoadSinkAlloca.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; RUN: opt < %s -instcombine -mem2reg -S | grep "%A = alloca" 
+; RUN: opt < %s -instcombine -mem2reg -S | \
+; RUN:    not grep "%B = alloca"
+; END.
+
+; Ensure that instcombine doesn't sink the loads in entry/cond_true into 
+; cond_next.  Doing so prevents mem2reg from promoting the B alloca.
+
+define i32 @test2(i32 %C) {
+entry:
+	%A = alloca i32
+	%B = alloca i32
+	%tmp = call i32 (...) @bar( i32* %A )		; <i32> [#uses=0]
+	%T = load i32, i32* %A		; <i32> [#uses=1]
+	%tmp2 = icmp eq i32 %C, 0		; <i1> [#uses=1]
+	br i1 %tmp2, label %cond_next, label %cond_true
+
+cond_true:		; preds = %entry
+	store i32 123, i32* %B
+	call i32 @test2( i32 123 )		; <i32>:0 [#uses=0]
+	%T1 = load i32, i32* %B		; <i32> [#uses=1]
+	br label %cond_next
+
+cond_next:		; preds = %cond_true, %entry
+	%tmp1.0 = phi i32 [ %T1, %cond_true ], [ %T, %entry ]		; <i32> [#uses=1]
+	%tmp7 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp8 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp9 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp10 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp11 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp12 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp13 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp14 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp15 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp16 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp17 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp18 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp19 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	%tmp20 = call i32 (...) @baq( )		; <i32> [#uses=0]
+	ret i32 %tmp1.0
+}
+
+declare i32 @bar(...)
+
+declare i32 @baq(...)

Added: llvm/trunk/test/Transforms/InstCombine/2007-02-07-PointerCast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-02-07-PointerCast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-02-07-PointerCast.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-02-07-PointerCast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+;RUN: opt < %s -instcombine -S | grep zext
+
+; Make sure the uint isn't removed.  Instcombine in llvm 1.9 was dropping the 
+; uint cast which was causing a sign extend. This only affected code with 
+; pointers in the high half of memory, so it wasn't noticed much;
+; try compiling a kernel with it, though...
+
+target datalayout = "e-p:32:32"
+@str = internal constant [6 x i8] c"%llx\0A\00"         ; <[6 x i8]*> [#uses=1]
+
+declare i32 @printf(i8*, ...)
+
+define i32 @main(i32 %x, i8** %a) {
+entry:
+        %tmp = getelementptr [6 x i8], [6 x i8]* @str, i32 0, i64 0               ; <i8*> [#uses=1]
+        %tmp1 = load i8*, i8** %a            ; <i8*> [#uses=1]
+        %tmp2 = ptrtoint i8* %tmp1 to i32               ; <i32> [#uses=1]
+        %tmp3 = zext i32 %tmp2 to i64           ; <i64> [#uses=1]
+        %tmp.upgrd.1 = call i32 (i8*, ...) @printf( i8* %tmp, i64 %tmp3 )              ; <i32> [#uses=0]
+        ret i32 0
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2007-02-23-PhiFoldInfLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-02-23-PhiFoldInfLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-02-23-PhiFoldInfLoop.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-02-23-PhiFoldInfLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,31 @@
+; RUN: opt < %s -instcombine -S | grep ret
+; PR1217
+
+target datalayout = "e-p:32:32"
+target triple = "i686-pc-linux-gnu"
+	%struct.termbox = type { %struct.termbox*, i32, i32, i32, i32, i32 }
+
+
+define void @ggenorien() {
+entry:
+	%tmp68 = icmp eq %struct.termbox* null, null		; <i1> [#uses=1]
+	br i1 %tmp68, label %cond_next448, label %bb80
+
+bb80:		; preds = %entry
+	ret void
+
+cond_next448:		; preds = %entry
+	br i1 false, label %bb756, label %bb595
+
+bb595:		; preds = %cond_next448
+	br label %bb609
+
+bb609:		; preds = %bb756, %bb595
+	%termnum.6240.0 = phi i32 [ 2, %bb595 ], [ %termnum.6, %bb756 ]		; <i32> [#uses=1]
+	%tmp755 = add i32 %termnum.6240.0, 1		; <i32> [#uses=1]
+	br label %bb756
+
+bb756:		; preds = %bb609, %cond_next448
+	%termnum.6 = phi i32 [ %tmp755, %bb609 ], [ 2, %cond_next448 ]		; <i32> [#uses=1]
+	br label %bb609
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-03-13-CompareMerge.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-03-13-CompareMerge.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-03-13-CompareMerge.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-03-13-CompareMerge.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | grep "icmp sle"
+; PR1244
+
+define i1 @test(i32 %c.3.i, i32 %d.292.2.i) {
+   %tmp266.i = icmp slt i32 %c.3.i, %d.292.2.i     
+   %tmp276.i = icmp eq i32 %c.3.i, %d.292.2.i 
+   %sel_tmp80 = or i1 %tmp266.i, %tmp276.i 
+   ret i1 %sel_tmp80
+}
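For reference: (a < b) | (a == b) is exactly a <= b, so the pair should merge into the single compare the grep looks for, roughly

        %sel_tmp80 = icmp sle i32 %c.3.i, %d.292.2.i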

Added: llvm/trunk/test/Transforms/InstCombine/2007-03-19-BadTruncChangePR1261.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-03-19-BadTruncChangePR1261.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-03-19-BadTruncChangePR1261.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-03-19-BadTruncChangePR1261.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | grep zext
+; PR1261. 
+
+define i16 @test(i31 %zzz) {
+  %A = sext i31 %zzz to i32
+  %B = add i32 %A, 16384
+  %C = lshr i32 %B, 15
+  %D = trunc i32 %C to i16
+  ret i16 %D
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-03-21-SignedRangeTest.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; For PR1248
+
+define i1 @test(i32 %tmp6) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[TMP6_OFF:%.*]] = add i32 %tmp6, 83
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP6_OFF]], 11
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %tmp7 = sdiv i32 %tmp6, 12
+  icmp ne i32 %tmp7, -6
+  ret i1 %1
+}
+
+define <2 x i1> @test_vec(<2 x i32> %tmp6) {
+; CHECK-LABEL: @test_vec(
+; CHECK-NEXT:    [[TMP6_OFF:%.*]] = add <2 x i32> %tmp6, <i32 83, i32 83>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <2 x i32> [[TMP6_OFF]], <i32 11, i32 11>
+; CHECK-NEXT:    ret <2 x i1> [[TMP1]]
+;
+  %tmp7 = sdiv <2 x i32> %tmp6, <i32 12, i32 12>
+  icmp ne <2 x i32> %tmp7, <i32 -6, i32 -6>
+  ret <2 x i1> %1
+}
+
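For reference: sdiv truncates toward zero, so %tmp6 sdiv 12 equals -6 exactly when %tmp6 lies in [-83, -72]. The CHECK'd form shifts that window to [0, 11] by adding 83; the ugt 11 then holds precisely when the quotient is not -6.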

Added: llvm/trunk/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; PR1271
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+%struct..0anon = type { i32, i32 }
+%struct..1anon = type { double }
+
+define i32 @main() {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[U:%.*]] = alloca %struct..1anon, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds %struct..1anon, %struct..1anon* [[U]], i64 0, i32 0
+; CHECK-NEXT:    store double 0x7FF0000000000000, double* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP34:%.*]] = bitcast %struct..1anon* [[U]] to %struct..0anon*
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds %struct..0anon, %struct..0anon* [[TMP34]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP89:%.*]] = and i32 [[TMP6]], 2146435072
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[TMP89]], 2146435072
+; CHECK-NEXT:    br i1 [[TMP0]], label %cond_false, label %cond_true
+; CHECK:       cond_true:
+; CHECK-NEXT:    ret i32 0
+; CHECK:       cond_false:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  %u = alloca %struct..1anon, align 8
+  %tmp1 = getelementptr %struct..1anon, %struct..1anon* %u, i32 0, i32 0
+  store double 0x7FF0000000000000, double* %tmp1
+  %tmp3 = getelementptr %struct..1anon, %struct..1anon* %u, i32 0, i32 0
+  %tmp34 = bitcast double* %tmp3 to %struct..0anon*
+  %tmp5 = getelementptr %struct..0anon, %struct..0anon* %tmp34, i32 0, i32 1
+  %tmp6 = load i32, i32* %tmp5
+  %tmp7 = shl i32 %tmp6, 1
+  %tmp8 = lshr i32 %tmp7, 21
+  %tmp89 = trunc i32 %tmp8 to i16
+  icmp ne i16 %tmp89, 2047
+  zext i1 %0 to i8
+  icmp ne i8 %1, 0
+  br i1 %2, label %cond_true, label %cond_false
+
+cond_true:
+  ret i32 0
+
+cond_false:
+  ret i32 1
+}
+
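For reference: the input code extracts the 11 exponent bits of the double's high word (the shl 1 drops the sign bit, the lshr 21 right-justifies) and compares them with 2047, the all-ones exponent of Inf/NaN. The CHECK'd output tests the same condition in place with a mask: 2146435072 is 0x7FF00000, the exponent field of the high word.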

Added: llvm/trunk/test/Transforms/InstCombine/2007-03-25-DoubleShift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-03-25-DoubleShift.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-03-25-DoubleShift.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-03-25-DoubleShift.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; PR1271
+; RUN: opt < %s -instcombine -S | grep and
+define i1 @test(i32 %tmp13) {
+entry:
+	%tmp14 = shl i32 %tmp13, 12		; <i32> [#uses=1]
+	%tmp15 = lshr i32 %tmp14, 12		; <i32> [#uses=1]
+	%res = icmp ne i32 %tmp15, 0		; <i1>:3 [#uses=1]
+        ret i1 %res
+}
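For reference: shl 12 followed by lshr 12 on i32 just clears the top 12 bits, so the expected replacement is an and with the low-20-bit mask, roughly

        %tmp15 = and i32 %tmp13, 1048575        ; 1048575 = 0xFFFFF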

Added: llvm/trunk/test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-03-26-BadShiftMask.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; PR1271
+; RUN: opt < %s -instcombine -S | \
+; RUN:    grep "ashr exact i32 %.mp137, 2"
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
+target triple = "i686-pc-linux-gnu"
+
+
+define i1 @test(i32* %tmp141, i32* %tmp145, 
+            i32 %b8, i32 %iftmp.430.0, i32* %tmp134.out, i32* %tmp137.out)
+{
+newFuncRoot:
+	%tmp133 = and i32 %b8, 1		; <i32> [#uses=1]
+	%tmp134 = shl i32 %tmp133, 3		; <i32> [#uses=3]
+	%tmp136 = ashr i32 %b8, 1		; <i32> [#uses=1]
+	%tmp137 = shl i32 %tmp136, 3		; <i32> [#uses=3]
+	%tmp139 = ashr i32 %tmp134, 2		; <i32> [#uses=1]
+	store i32 %tmp139, i32* %tmp141
+	%tmp143 = ashr i32 %tmp137, 2		; <i32> [#uses=1]
+	store i32 %tmp143, i32* %tmp145
+	icmp eq i32 %iftmp.430.0, 0		; <i1>:0 [#uses=1]
+	zext i1 %0 to i8		; <i8>:1 [#uses=1]
+	icmp ne i8 %1, 0		; <i1>:2 [#uses=1]
+	br i1 %2, label %cond_true147.exitStub, label %cond_false252.exitStub
+
+cond_true147.exitStub:		; preds = %newFuncRoot
+	store i32 %tmp134, i32* %tmp134.out
+	store i32 %tmp137, i32* %tmp137.out
+	ret i1 true
+
+cond_false252.exitStub:		; preds = %newFuncRoot
+	store i32 %tmp134, i32* %tmp134.out
+	store i32 %tmp137, i32* %tmp137.out
+	ret i1 false
+}
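For reference: %tmp137 was just shifted left by 3, so its low three bits are known zero and the ashr by 2 discards only zeros; that is what licenses the "ashr exact" the RUN line greps for.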

Added: llvm/trunk/test/Transforms/InstCombine/2007-04-08-SingleEltVectorCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-04-08-SingleEltVectorCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-04-08-SingleEltVectorCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-04-08-SingleEltVectorCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,7 @@
+; RUN: opt < %s -instcombine -disable-output
+; PR1304
+
+define i64 @bork(<1 x i64> %vec) {
+  %tmp = extractelement <1 x i64> %vec, i32 0
+  ret i64 %tmp
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-05-10-icmp-or.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-05-10-icmp-or.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-05-10-icmp-or.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-05-10-icmp-or.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -disable-output
+define i1 @test(i32 %tmp9) {
+        %tmp20 = icmp ugt i32 %tmp9, 255                ; <i1> [#uses=1]
+        %tmp11.not = icmp sgt i32 %tmp9, 255            ; <i1> [#uses=1]
+        %bothcond = or i1 %tmp20, %tmp11.not            ; <i1> [#uses=1]
+        ret i1 %bothcond
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2007-05-14-Crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-05-14-Crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-05-14-Crash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-05-14-Crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,18 @@
+; RUN: opt < %s -instcombine -disable-output
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
+target triple = "powerpc-unknown-linux-gnu"
+
+%struct.abc = type { i32, [32 x i8] }        
+%struct.def = type { i8**, %struct.abc }        
+        %struct.anon = type <{  }>
+
+define i8* @foo(%struct.anon* %deviceRef, %struct.abc* %pCap) {
+entry:
+        %tmp1 = bitcast %struct.anon* %deviceRef to %struct.def*            
+        %tmp3 = getelementptr %struct.def, %struct.def* %tmp1, i32 0, i32 1               
+        %tmp35 = bitcast %struct.abc* %tmp3 to i8*           
+        ret i8* %tmp35
+}
+
+

Added: llvm/trunk/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | grep "call.*sret"
+; Make sure instcombine doesn't drop the sret attribute.
+
+define void @blah(i16* %tmp10) {
+entry:
+	call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend_stret to void (i16*)*)( i16*  sret %tmp10  )
+	ret void
+}
+
+declare i8* @objc_msgSend_stret(i8*, i8*, ...)

Added: llvm/trunk/test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-06-06-AshrSignBit.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt < %s -instcombine -S | grep "ashr"
+; PR1499
+
+define void @av_cmp_q_cond_true(i32* %retval, i32* %tmp9, i64* %tmp10) {
+newFuncRoot:
+	br label %cond_true
+
+return.exitStub:		; preds = %cond_true
+	ret void
+
+cond_true:		; preds = %newFuncRoot
+	%tmp30 = load i64, i64* %tmp10		; <i64> [#uses=1]
+	%.cast = zext i32 63 to i64		; <i64> [#uses=1]
+	%tmp31 = ashr i64 %tmp30, %.cast		; <i64> [#uses=1]
+	%tmp3132 = trunc i64 %tmp31 to i32		; <i32> [#uses=1]
+	%tmp33 = or i32 %tmp3132, 1		; <i32> [#uses=1]
+	store i32 %tmp33, i32* %tmp9
+	%tmp34 = load i32, i32* %tmp9		; <i32> [#uses=1]
+	store i32 %tmp34, i32* %retval
+	br label %return.exitStub
+}
+
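For reference: the ashr by 63 broadcasts the sign bit, so %tmp31 is 0 for non-negative %tmp30 and -1 otherwise, and %tmp33 ends up as 1 or -1. The grep checks that the sign-extracting ashr is not dropped (PR1499).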

Added: llvm/trunk/test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-06-21-DivCompareMiscomp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | grep "ret i1 true"
+; rdar://5278853
+
+define i1 @test(i32 %tmp468) {
+        %tmp470 = udiv i32 %tmp468, 4           ; <i32> [#uses=2]
+        %tmp475 = icmp ult i32 %tmp470, 1073741824              ; <i1> [#uses=1]
+        ret i1 %tmp475
+}
+
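
For context, the fold exercised here: an unsigned divide by 4 bounds %tmp470 by 0x3FFFFFFF, strictly below 2^30 = 1073741824, so the ult compare is always true and should fold to ret i1 true. A minimal hand-written sketch (not from the commit):

define i1 @sketch(i32 %x) {
	%q = udiv i32 %x, 4		; at most 0x3FFFFFFF, strictly below 2^30
	%c = icmp ult i32 %q, 1073741824
	ret i1 %c			; expected to become: ret i1 true
}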

Added: llvm/trunk/test/Transforms/InstCombine/2007-08-02-InfiniteLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-08-02-InfiniteLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-08-02-InfiniteLoop.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-08-02-InfiniteLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -disable-output
+; PR1594
+
+define i64 @test(i16 %tmp510, i16 %tmp512) {
+	%W = sext i16 %tmp510 to i32		; <i32> [#uses=1]
+	%X = sext i16 %tmp512 to i32		; <i32> [#uses=1]
+	%Y = add i32 %W, %X		; <i32> [#uses=1]
+	%Z = sext i32 %Y to i64		; <i64> [#uses=1]
+	ret i64 %Z
+}
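
The widening here is safe: each sext'd i16 fits in 17 bits, so the i32 add cannot overflow and the casts may move outward without changing the value. A hand-written sketch of the equivalent widened form (an illustration, not taken from the commit):

define i64 @sketch(i16 %a, i16 %b) {
	%wa = sext i16 %a to i64
	%wb = sext i16 %b to i64
	%s = add i64 %wa, %wb		; equals sext of the i32 sum, which cannot overflow
	ret i64 %s
}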

Added: llvm/trunk/test/Transforms/InstCombine/2007-09-10-AliasConstFold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-09-10-AliasConstFold.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-09-10-AliasConstFold.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-09-10-AliasConstFold.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; RUN: opt < %s -instcombine -S | grep icmp
+; PR1646
+
+ at __gthrw_pthread_cancel = weak alias i32 (i32), i32 (i32)* @pthread_cancel		; <i32 (i32)*> [#uses=1]
+ at __gthread_active_ptr.5335 = internal constant i8* bitcast (i32 (i32)* @__gthrw_pthread_cancel to i8*)		; <i8**> [#uses=1]
+define weak i32 @pthread_cancel(i32) {
+       ret i32 0
+}
+
+define i1 @__gthread_active_p() {
+entry:
+	%tmp1 = load i8*, i8** @__gthread_active_ptr.5335, align 4		; <i8*> [#uses=1]
+	%tmp2 = icmp ne i8* %tmp1, null		; <i1> [#uses=1]
+	ret i1 %tmp2
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-09-17-AliasConstFold2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-09-17-AliasConstFold2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-09-17-AliasConstFold2.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-09-17-AliasConstFold2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -instcombine -S | grep icmp
+; PR1678
+
+ at A = weak alias void (), void ()* @B		; <void ()*> [#uses=1]
+
+define weak void @B() {
+       ret void
+}
+
+define i32 @active() {
+entry:
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%tmp1 = icmp ne void ()* @A, null		; <i1> [#uses=1]
+	%tmp12 = zext i1 %tmp1 to i32		; <i32> [#uses=1]
+	ret i32 %tmp12
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -instcombine -S | not grep call
+; RUN: opt < %s -O3 -S | not grep xyz
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+ at .str = internal constant [4 x i8] c"xyz\00"		; <[4 x i8]*> [#uses=1]
+
+define void @foo(i8* %P) {
+entry:
+  %P_addr = alloca i8*
+  store i8* %P, i8** %P_addr
+  %tmp = load i8*, i8** %P_addr, align 4
+  %tmp1 = getelementptr [4 x i8], [4 x i8]* @.str, i32 0, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* %tmp1, i32 4, i1 false)
+  br label %return
+
+return:                                           ; preds = %entry
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind

Added: llvm/trunk/test/Transforms/InstCombine/2007-10-12-Crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-10-12-Crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-10-12-Crash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-10-12-Crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; RUN: opt < %s -instcombine -disable-output
+
+	%struct.Ray = type { %struct.Vec, %struct.Vec }
+	%struct.Scene = type { i32 (...)** }
+	%struct.Vec = type { double, double, double }
+
+declare double @_Z9ray_traceRK3VecRK3RayRK5Scene(%struct.Vec*, %struct.Ray*, %struct.Scene*)
+
+define i32 @main(i32 %argc, i8** %argv) {
+entry:
+	%tmp3 = alloca %struct.Ray, align 4		; <%struct.Ray*> [#uses=2]
+	%tmp97 = icmp slt i32 0, 512		; <i1> [#uses=1]
+	br i1 %tmp97, label %bb71, label %bb108
+
+bb29:		; preds = %bb62
+	%tmp322 = bitcast %struct.Ray* %tmp3 to %struct.Vec*		; <%struct.Vec*> [#uses=1]
+	%tmp322.0 = getelementptr %struct.Vec, %struct.Vec* %tmp322, i32 0, i32 0		; <double*> [#uses=1]
+	store double 0.000000e+00, double* %tmp322.0
+	%tmp57 = call double @_Z9ray_traceRK3VecRK3RayRK5Scene( %struct.Vec* null, %struct.Ray* %tmp3, %struct.Scene* null )		; <double> [#uses=0]
+	br label %bb62
+
+bb62:		; preds = %bb71, %bb29
+	%tmp65 = icmp slt i32 0, 4		; <i1> [#uses=1]
+	br i1 %tmp65, label %bb29, label %bb68
+
+bb68:		; preds = %bb62
+	ret i32 0
+
+bb71:		; preds = %entry
+	%tmp74 = icmp slt i32 0, 4		; <i1> [#uses=1]
+	br i1 %tmp74, label %bb62, label %bb77
+
+bb77:		; preds = %bb71
+	ret i32 0
+
+bb108:		; preds = %entry
+	ret i32 0
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-10-28-stacksave.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-10-28-stacksave.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-10-28-stacksave.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-10-28-stacksave.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; RUN: opt < %s -instcombine -S | grep "call.*stacksave"
+; PR1745
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin8"
+ at p = weak global i8* null		; <i8**> [#uses=1]
+
+define i32 @main() {
+entry:
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	br label %lab
+
+lab:		; preds = %cleanup31, %entry
+	%n.0 = phi i32 [ 0, %entry ], [ %tmp25, %cleanup31 ]		; <i32> [#uses=2]
+	%tmp2 = call i8* @llvm.stacksave( )		; <i8*> [#uses=2]
+	%tmp4 = srem i32 %n.0, 47		; <i32> [#uses=1]
+	%tmp5 = add i32 %tmp4, 1		; <i32> [#uses=5]
+	%tmp7 = sub i32 %tmp5, 1		; <i32> [#uses=0]
+	%tmp89 = zext i32 %tmp5 to i64		; <i64> [#uses=1]
+	%tmp10 = mul i64 %tmp89, 32		; <i64> [#uses=0]
+	%tmp12 = mul i32 %tmp5, 4		; <i32> [#uses=0]
+	%tmp1314 = zext i32 %tmp5 to i64		; <i64> [#uses=1]
+	%tmp15 = mul i64 %tmp1314, 32		; <i64> [#uses=0]
+	%tmp17 = mul i32 %tmp5, 4		; <i32> [#uses=1]
+	%tmp18 = alloca i8, i32 %tmp17		; <i8*> [#uses=1]
+	%tmp1819 = bitcast i8* %tmp18 to i32*		; <i32*> [#uses=2]
+	%tmp21 = getelementptr i32, i32* %tmp1819, i32 0		; <i32*> [#uses=1]
+	store i32 1, i32* %tmp21, align 4
+	%tmp2223 = bitcast i32* %tmp1819 to i8*		; <i8*> [#uses=1]
+	store volatile i8* %tmp2223, i8** @p, align 4
+	%tmp25 = add i32 %n.0, 1		; <i32> [#uses=2]
+	%tmp27 = icmp sle i32 %tmp25, 999999		; <i1> [#uses=1]
+	%tmp2728 = zext i1 %tmp27 to i8		; <i8> [#uses=1]
+	%toBool = icmp ne i8 %tmp2728, 0		; <i1> [#uses=1]
+	br i1 %toBool, label %cleanup31, label %cond_next
+
+cond_next:		; preds = %lab
+	call void @llvm.stackrestore( i8* %tmp2 )
+	ret i32 0
+
+cleanup31:		; preds = %lab
+	call void @llvm.stackrestore( i8* %tmp2 )
+	br label %lab
+}
+
+declare i8* @llvm.stacksave()
+
+declare void @llvm.stackrestore(i8*)

Added: llvm/trunk/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-10-31-RangeCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt < %s -instcombine -disable-output
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
+target triple = "powerpc-unknown-linux-gnu"
+
+define i32 @test() {
+entry:
+	%tmp50.i17 = icmp slt i32 0, 4		; <i1> [#uses=1]
+	br i1 %tmp50.i17, label %bb.i, label %calculateColorSpecificBlackLevel.exit
+
+bb.i:		; preds = %entry
+	br label %bb51.i.i
+
+bb27.i.i:		; preds = %bb51.i.i
+	%tmp31.i.i = load i16, i16* null, align 2		; <i16> [#uses=2]
+	%tmp35.i.i = icmp ult i16 %tmp31.i.i, 1		; <i1> [#uses=1]
+	%tmp41.i.i = icmp ugt i16 %tmp31.i.i, -1		; <i1> [#uses=1]
+	%bothcond.i.i = or i1 %tmp35.i.i, %tmp41.i.i		; <i1> [#uses=1]
+	%bothcond1.i.i = zext i1 %bothcond.i.i to i32		; <i32> [#uses=1]
+	%tmp46.i.i = xor i32 %bothcond1.i.i, 1		; <i32> [#uses=1]
+	%count.0.i.i = add i32 %count.1.i.i, %tmp46.i.i		; <i32> [#uses=1]
+	%tmp50.i.i = add i32 %x.0.i.i, 2		; <i32> [#uses=1]
+	br label %bb51.i.i
+
+bb51.i.i:		; preds = %bb27.i.i, %bb.i
+	%count.1.i.i = phi i32 [ %count.0.i.i, %bb27.i.i ], [ 0, %bb.i ]		; <i32> [#uses=1]
+	%x.0.i.i = phi i32 [ %tmp50.i.i, %bb27.i.i ], [ 0, %bb.i ]		; <i32> [#uses=2]
+	%tmp54.i.i = icmp slt i32 %x.0.i.i, 0		; <i1> [#uses=1]
+	br i1 %tmp54.i.i, label %bb27.i.i, label %bb57.i.i
+
+bb57.i.i:		; preds = %bb51.i.i
+	ret i32 0
+
+calculateColorSpecificBlackLevel.exit:		; preds = %entry
+	ret i32 undef
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-10-31-StringCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-10-31-StringCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-10-31-StringCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-10-31-StringCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; RUN: opt < %s -instcombine -disable-output
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin8"
+
+declare void @__darwin_gcc3_preregister_frame_info()
+
+define void @_start(i32 %argc, i8** %argv, i8** %envp) {
+entry:
+	%tmp1 = bitcast void ()* @__darwin_gcc3_preregister_frame_info to i32*		; <i32*> [#uses=1]
+	%tmp2 = load i32, i32* %tmp1, align 4		; <i32> [#uses=1]
+	%tmp3 = icmp ne i32 %tmp2, 0		; <i1> [#uses=1]
+	%tmp34 = zext i1 %tmp3 to i8		; <i8> [#uses=1]
+	%toBool = icmp ne i8 %tmp34, 0		; <i1> [#uses=1]
+	br i1 %toBool, label %cond_true, label %return
+
+cond_true:		; preds = %entry
+	ret void
+
+return:		; preds = %entry
+	ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-11-07-OpaqueAlignCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-11-07-OpaqueAlignCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-11-07-OpaqueAlignCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-11-07-OpaqueAlignCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt < %s -instcombine -disable-output
+; PR1780
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i686-pc-linux-gnu"
+
+%opaque_t = type opaque
+%opaque2 = type opaque
+%op_ts = type {%opaque2, i32}
+
+ at g = external global %opaque_t
+ at h = external global %op_ts
+
+define i32 @foo() {
+entry:
+        %x = load i8, i8* bitcast (%opaque_t* @g to i8*)
+        %y = load i32, i32* bitcast (%op_ts* @h to i32*)
+	%z = zext i8 %x to i32
+	%r = add i32 %y, %z
+        ret i32 %r
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-11-15-CompareMiscomp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | grep "icmp eq i32 %In, 1"
+; PR1800
+
+define i1 @test(i32 %In) {
+	%c1 = icmp sgt i32 %In, -1
+	%c2 = icmp eq i32 %In, 1
+	%V = and i1 %c1, %c2
+	ret i1 %V
+}
+
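
The reasoning behind the expected output: %In == 1 already implies %In sgt -1, so the conjunction reduces to the equality test alone. Expected result, as a hand-written sketch:

define i1 @sketch(i32 %In) {
	%V = icmp eq i32 %In, 1		; the sgt -1 clause is implied and drops away
	ret i1 %V
}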

Added: llvm/trunk/test/Transforms/InstCombine/2007-11-25-CompatibleAttributes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-11-25-CompatibleAttributes.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-11-25-CompatibleAttributes.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-11-25-CompatibleAttributes.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine -S | not grep bitcast
+; PR1716
+
+ at .str = internal constant [4 x i8] c"%d\0A\00"		; <[4 x i8]*> [#uses=1]
+
+define i32 @main(i32 %argc, i8** %argv) {
+entry:
+	%tmp32 = tail call i32 (i8*  , ...) bitcast (i32 (i8*, ...)  * @printf to i32 (i8*  , ...)  *)( i8* getelementptr ([4 x i8], [4 x i8]* @.str, i32 0, i32 0)  , i32 0 ) nounwind 		; <i32> [#uses=0]
+	ret i32 undef
+}
+
+declare i32 @printf(i8*, ...) nounwind 

Added: llvm/trunk/test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-12-10-ConstFoldCompare.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i686-pc-linux-gnu"
+; RUN: opt < %s -instcombine -S | not grep "ret i1 0"
+; PR1850
+
+define i1 @test() {
+	%cond = icmp ule i8* inttoptr (i64 4294967297 to i8*), inttoptr (i64 5 to i8*)
+	ret i1 %cond
+}
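
Why "ret i1 0" would be wrong here: the datalayout above declares 32-bit pointers, so inttoptr truncates i64 4294967297 to address 1, and 1 ule 5 holds. Folding on the untruncated i64 constants gives the opposite answer. Comment-only sketch:

; inttoptr (i64 4294967297 to i8*)  ->  address 0x00000001 under 32-bit pointers
; inttoptr (i64 5 to i8*)           ->  address 0x00000005
; so the ule compare should fold to true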

Added: llvm/trunk/test/Transforms/InstCombine/2007-12-12-GEPScale.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-12-12-GEPScale.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-12-12-GEPScale.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-12-12-GEPScale.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | not grep 1431655764
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+
+define i8* @foo([100 x {i8,i8,i8}]* %x) {
+entry:
+        %p = bitcast [100 x {i8,i8,i8}]* %x to i8*
+        %q = getelementptr i8, i8* %p, i32 -4
+        ret i8* %q
+}
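
The magic constant in the RUN line: treating the -4 byte offset as unsigned and dividing by the 3-byte element size gives 4294967292 / 3 = 1431655764. Since -4 is not a multiple of 3, the byte GEP cannot be rescaled into a typed GEP, and the grep catches that bogus index leaking out. One plausible shape of the bad rewrite, for illustration only:

; WRONG: getelementptr [100 x {i8,i8,i8}], [100 x {i8,i8,i8}]* %x, i32 0, i32 1431655764, i32 0
; no element index reproduces a byte offset of -4 at a 3-byte scale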

Added: llvm/trunk/test/Transforms/InstCombine/2007-12-16-AsmNoUnwind.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-12-16-AsmNoUnwind.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-12-16-AsmNoUnwind.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-12-16-AsmNoUnwind.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,7 @@
+; RUN: opt < %s -instcombine -S | grep nounwind
+
+define void @bar() {
+entry:
+        call void asm sideeffect "", "~{dirflag},~{fpsr},~{flags}"( )
+        ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-12-18-AddSelCmpSub.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i32 @foo(i32 %a) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[T15:%.*]] = sub i32 99, [[A:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[T15]], 0
+; CHECK-NEXT:    [[A_OP:%.*]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[T13:%.*]] = select i1 [[TMP1]], i32 100, i32 [[A_OP]]
+; CHECK-NEXT:    ret i32 [[T13]]
+;
+  %t15 = sub i32 99, %a
+  %t16 = icmp slt i32 %t15, 0
+  %smax = select i1 %t16, i32 0, i32 %t15
+  %t12 = add i32 %smax, %a
+  %t13 = add i32 %t12, 1
+  ret i32 %t13
+}
+
+define i32 @bar(i32 %a) {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:    [[T15:%.*]] = sub i32 99, [[A:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[T15]], 0
+; CHECK-NEXT:    [[T12:%.*]] = select i1 [[TMP1]], i32 99, i32 [[A]]
+; CHECK-NEXT:    ret i32 [[T12]]
+;
+  %t15 = sub i32 99, %a
+  %t16 = icmp slt i32 %t15, 0
+  %smax = select i1 %t16, i32 0, i32 %t15
+  %t12 = add i32 %smax, %a
+  ret i32 %t12
+}
+
+define i32 @fun(i32 %a) {
+; CHECK-LABEL: @fun(
+; CHECK-NEXT:    [[T16:%.*]] = icmp slt i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[T12:%.*]] = select i1 [[T16]], i32 [[A]], i32 99
+; CHECK-NEXT:    ret i32 [[T12]]
+;
+  %t15 = sub i32 99, %a
+  %t16 = icmp slt i32 %a, 0
+  %smax = select i1 %t16, i32 0, i32 %t15
+  %t12 = add i32 %smax, %a
+  ret i32 %t12
+}

Added: llvm/trunk/test/Transforms/InstCombine/2007-12-28-IcmpSub2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2007-12-28-IcmpSub2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2007-12-28-IcmpSub2.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2007-12-28-IcmpSub2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,89 @@
+; RUN: opt < %s -mem2reg -instcombine -S | grep "ret i32 1" | count 8
+
+define i32 @test1() {
+entry:
+	%z = alloca i32
+	store i32 0, i32* %z
+	%tmp = load i32, i32* %z
+	%sub = sub i32 %tmp, 1
+	%cmp = icmp ule i32 %sub, 0
+	%retval = select i1 %cmp, i32 0, i32 1
+	ret i32 %retval
+}
+
+define i32 @test2() {
+entry:
+	%z = alloca i32
+	store i32 0, i32* %z
+	%tmp = load i32, i32* %z
+	%sub = sub i32 %tmp, 1
+	%cmp = icmp ugt i32 %sub, 0
+	%retval = select i1 %cmp, i32 1, i32 0
+	ret i32 %retval
+}
+
+define i32 @test3() {
+entry:
+	%z = alloca i32
+	store i32 0, i32* %z
+	%tmp = load i32, i32* %z
+	%sub = sub i32 %tmp, 1
+	%cmp = icmp slt i32 %sub, 0
+	%retval = select i1 %cmp, i32 1, i32 0
+	ret i32 %retval
+}
+
+define i32 @test4() {
+entry:
+	%z = alloca i32
+	store i32 0, i32* %z
+	%tmp = load i32, i32* %z
+	%sub = sub i32 %tmp, 1
+	%cmp = icmp sle i32 %sub, 0
+	%retval = select i1 %cmp, i32 1, i32 0
+	ret i32 %retval
+}
+
+define i32 @test5() {
+entry:
+	%z = alloca i32
+	store i32 0, i32* %z
+	%tmp = load i32, i32* %z
+	%sub = sub i32 %tmp, 1
+	%cmp = icmp sge i32 %sub, 0
+	%retval = select i1 %cmp, i32 0, i32 1
+	ret i32 %retval
+}
+
+define i32 @test6() {
+entry:
+	%z = alloca i32
+	store i32 0, i32* %z
+	%tmp = load i32, i32* %z
+	%sub = sub i32 %tmp, 1
+	%cmp = icmp sgt i32 %sub, 0
+	%retval = select i1 %cmp, i32 0, i32 1
+	ret i32 %retval
+}
+
+define i32 @test7() {
+entry:
+	%z = alloca i32
+	store i32 0, i32* %z
+	%tmp = load i32, i32* %z
+	%sub = sub i32 %tmp, 1
+	%cmp = icmp eq i32 %sub, 0
+	%retval = select i1 %cmp, i32 0, i32 1
+	ret i32 %retval
+}
+
+define i32 @test8() {
+entry:
+	%z = alloca i32
+	store i32 0, i32* %z
+	%tmp = load i32, i32* %z
+	%sub = sub i32 %tmp, 1
+	%cmp = icmp ne i32 %sub, 0
+	%retval = select i1 %cmp, i32 1, i32 0
+	ret i32 %retval
+}
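
All eight functions share one skeleton: after -mem2reg the load becomes 0, %sub constant-folds to 0 - 1 = -1 (0xFFFFFFFF), and the compare against 0 decides the select. -1 is unsigned-greater-than 0 and signed-less-than 0, but not equal to 0, and each select is arranged so the taken arm yields 1; hence the count 8. Comment-only summary:

; %sub = -1:  ule 0? no   ugt 0? yes   slt 0? yes   sle 0? yes
;             sge 0? no   sgt 0? no    eq 0?  no    ne 0?  yes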

Added: llvm/trunk/test/Transforms/InstCombine/2008-01-06-BitCastAttributes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-01-06-BitCastAttributes.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-01-06-BitCastAttributes.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-01-06-BitCastAttributes.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,30 @@
+; Ignore stderr; we expect warnings there.
+; RUN: opt < %s -instcombine 2> /dev/null -S | FileCheck %s
+
+; CHECK-NOT: bitcast
+
+define void @a() {
+  ret void
+}
+
+define signext i32 @b(i32* inreg  %x)   {
+  ret i32 0
+}
+
+define void @c(...) {
+  ret void
+}
+
+define void @g(i32* %y) {
+; CHECK-LABEL: @g(
+; CHECK: call i64 bitcast (i32 (i32*)* @b to i64 (i32)*)(i32 0)
+	%x = call i64 bitcast (i32 (i32*)* @b to i64 (i32)*)( i32 0 )		; <i64> [#uses=0]
+
+; The rest should not have bitcasts remaining
+; CHECK-NOT: bitcast
+  call void bitcast (void ()* @a to void (i32*)*)( i32* noalias  %y )
+  call <2 x i32> bitcast (i32 (i32*)* @b to <2 x i32> (i32*)*)( i32* inreg  null )		; <<2 x i32>>:1 [#uses=0]
+  call void bitcast (void (...)* @c to void (i32)*)( i32 0 )
+  call void bitcast (void (...)* @c to void (i32)*)( i32 zeroext  0 )
+  ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-01-06-CastCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-01-06-CastCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-01-06-CastCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-01-06-CastCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -disable-output
+
+define <2 x i32> @f() {
+	ret <2 x i32> undef
+}
+
+define i32 @g() {
+	%x = call i32 bitcast (<2 x i32> ()* @f to i32 ()*)( )		; <i32> [#uses=1]
+	ret i32 %x
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-01-06-VoidCast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-01-06-VoidCast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-01-06-VoidCast.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-01-06-VoidCast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @f(i16 %y) {
+  ret void
+}
+
+define i32 @g(i32 %y) {
+; CHECK-LABEL: @g(
+; CHECK: call i32 bitcast
+  %x = call i32 bitcast (void (i16)* @f to i32 (i32)*)( i32 %y )		; <i32> [#uses=1]
+  ret i32 %x
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-01-13-AndCmpCmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-01-13-AndCmpCmp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-01-13-AndCmpCmp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-01-13-AndCmpCmp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | grep and
+; PR1907
+
+define i1 @test(i32 %c84.17) {
+	%tmp2696 = icmp ne i32 %c84.17, 34		; <i1> [#uses=2]
+	%tmp2699 = icmp sgt i32 %c84.17, -1		; <i1> [#uses=1]
+	%tmp2703 = and i1 %tmp2696, %tmp2699		; <i1> [#uses=1]
+	ret i1 %tmp2703
+}
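
The grep checks that the and survives: "%c84.17 != 34 and %c84.17 >= 0" describes [0, 2^31) with the single point 34 removed, which splits into two intervals and so cannot be expressed by any single icmp; merging the two compares would change semantics. Comment-only note:

; {0 .. 2^31-1} \ {34} = [0, 34) U (34, 2^31)  --  not one icmp range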

Added: llvm/trunk/test/Transforms/InstCombine/2008-01-14-VarArgTrampoline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-01-14-VarArgTrampoline.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-01-14-VarArgTrampoline.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-01-14-VarArgTrampoline.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+; RUN: opt < %s -instcombine -S | grep zeroext
+
+	%struct.FRAME.nest = type { i32, i32 (...)* }
+	%struct.__builtin_trampoline = type { [10 x i8] }
+
+declare void @llvm.init.trampoline(i8*, i8*, i8*) nounwind 
+declare i8* @llvm.adjust.trampoline(i8*) nounwind
+
+declare i32 @f(%struct.FRAME.nest* nest , ...)
+
+define i32 @nest(i32 %n) {
+entry:
+	%FRAME.0 = alloca %struct.FRAME.nest, align 8		; <%struct.FRAME.nest*> [#uses=3]
+	%TRAMP.216 = alloca [10 x i8], align 16		; <[10 x i8]*> [#uses=1]
+	%TRAMP.216.sub = getelementptr [10 x i8], [10 x i8]* %TRAMP.216, i32 0, i32 0		; <i8*> [#uses=1]
+	%tmp3 = getelementptr %struct.FRAME.nest, %struct.FRAME.nest* %FRAME.0, i32 0, i32 0		; <i32*> [#uses=1]
+	store i32 %n, i32* %tmp3, align 8
+	%FRAME.06 = bitcast %struct.FRAME.nest* %FRAME.0 to i8*		; <i8*> [#uses=1]
+	call void @llvm.init.trampoline( i8* %TRAMP.216.sub, i8* bitcast (i32 (%struct.FRAME.nest*, ...)* @f to i8*), i8* %FRAME.06 )		; <i8*> [#uses=1]
+        %tramp = call i8* @llvm.adjust.trampoline( i8* %TRAMP.216.sub)
+	%tmp7 = getelementptr %struct.FRAME.nest, %struct.FRAME.nest* %FRAME.0, i32 0, i32 1		; <i32 (...)**> [#uses=1]
+	%tmp89 = bitcast i8* %tramp to i32 (...)*		; <i32 (...)*> [#uses=2]
+	store i32 (...)* %tmp89, i32 (...)** %tmp7, align 8
+	%tmp2.i = call i32 (...) %tmp89( i32 zeroext 0 )		; <i32> [#uses=1]
+	ret i32 %tmp2.i
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-01-21-MismatchedCastAndCompare.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-01-21-MismatchedCastAndCompare.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-01-21-MismatchedCastAndCompare.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-01-21-MismatchedCastAndCompare.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR1940
+
+define i1 @test1(i8 %A, i8 %B) {
+        %a = zext i8 %A to i32
+        %b = zext i8 %B to i32
+        %c = icmp sgt i32 %a, %b
+        ret i1 %c
+; CHECK: %c = icmp ugt i8 %A, %B
+; CHECK: ret i1 %c
+}
+
+define i1 @test2(i8 %A, i8 %B) {
+        %a = sext i8 %A to i32
+        %b = sext i8 %B to i32
+        %c = icmp ugt i32 %a, %b
+        ret i1 %c
+; CHECK: %c = icmp ugt i8 %A, %B
+; CHECK: ret i1 %c
+}
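
Both folds rest on monotonicity. zext maps i8 into [0, 255], where signed and unsigned i32 order agree, so the sgt narrows to ugt on i8. sext maps [0, 127] to itself and [128, 255] to [0xFFFFFF80, 0xFFFFFFFF], preserving unsigned i8 order, so the ugt also narrows to ugt on i8. Hand-picked example values:

; test2: A = 0xFF, B = 0x01
;   after sext:  0xFFFFFFFF ugt 0x00000001  ->  true
;   on i8:             0xFF ugt       0x01  ->  true   (orders agree)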

Added: llvm/trunk/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-01-21-MulTrunc.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+
+define i16 @test1(i16 %a) {
+        %tmp = zext i16 %a to i32               ; <i32> [#uses=2]
+        %tmp21 = lshr i32 %tmp, 8               ; <i32> [#uses=1]
+; CHECK: %tmp21 = lshr i16 %a, 8
+        %tmp5 = mul i32 %tmp, 5         ; <i32> [#uses=1]
+; CHECK: %tmp5 = mul i16 %a, 5
+        %tmp.upgrd.32 = or i32 %tmp21, %tmp5            ; <i32> [#uses=1]
+; CHECK: %tmp.upgrd.32 = or i16 %tmp21, %tmp5
+        %tmp.upgrd.3 = trunc i32 %tmp.upgrd.32 to i16           ; <i16> [#uses=1]
+        ret i16 %tmp.upgrd.3
+; CHECK: ret i16 %tmp.upgrd.32
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-01-27-FloatSelect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-01-27-FloatSelect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-01-27-FloatSelect.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-01-27-FloatSelect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: select
+
+define double @fold(i1 %a, double %b) {
+  %s = select i1 %a, double 0.0, double 1.0
+  %c = fdiv double %b, %s
+  ret double %c
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-02-13-MulURem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-02-13-MulURem.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-02-13-MulURem.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-02-13-MulURem.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR1933
+
+; CHECK: rem
+
+define i32 @fold(i32 %a) {
+  %s = mul i32 %a, 3
+  %c = urem i32 %s, 3
+  ret i32 %c
+}
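
Why the rem must stay: the mul wraps, so (%a * 3) urem 3 is not always 0. A hand-computed counterexample:

; %a = 1431655766:  %a * 3 = 4294967298, which wraps to 2, and 2 urem 3 = 2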

Added: llvm/trunk/test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-02-16-SDivOverflow2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | grep "sdiv i8 \%a, 9"
+; PR2048
+
+define i8 @i(i8 %a) {
+  %tmp1 = sdiv i8 %a, -3
+  %tmp2 = sdiv i8 %tmp1, -3
+  ret i8 %tmp2
+}
+
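
The expected fold: two truncating signed divisions combine into one when the product of the divisors is representable, and (-3) * (-3) = 9 fits comfortably in i8. Expected output, as a hand-written sketch:

define i8 @sketch(i8 %a) {
	%r = sdiv i8 %a, 9
	ret i8 %r
}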

Added: llvm/trunk/test/Transforms/InstCombine/2008-02-23-MulSub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-02-23-MulSub.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-02-23-MulSub.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-02-23-MulSub.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | not grep mul
+
+define i26 @test(i26 %a) nounwind  {
+entry:
+	%_add = mul i26 %a, 2885		; <i26> [#uses=1]
+	%_shl2 = mul i26 %a, 2884		; <i26> [#uses=1]
+	%_sub = sub i26 %_add, %_shl2		; <i26> [#uses=1]
+	ret i26 %_sub
+}
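
Both multiplies should vanish: %a * 2885 - %a * 2884 distributes to %a * (2885 - 2884) = %a, and distribution holds in any bit width, including i26. Expected output, as a sketch:

define i26 @sketch(i26 %a) {
	ret i26 %a
}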

Added: llvm/trunk/test/Transforms/InstCombine/2008-02-28-OrFCmpCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-02-28-OrFCmpCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-02-28-OrFCmpCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-02-28-OrFCmpCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -instcombine | llvm-dis
+; rdar://5771353
+
+define float @test(float %x, x86_fp80 %y) nounwind readonly  {
+entry:
+	%tmp67 = fcmp uno x86_fp80 %y, 0xK00000000000000000000		; <i1> [#uses=1]
+	%tmp71 = fcmp uno float %x, 0.000000e+00		; <i1> [#uses=1]
+	%bothcond = or i1 %tmp67, %tmp71		; <i1> [#uses=1]
+	br i1 %bothcond, label %bb74, label %bb80
+
+bb74:		; preds = %entry
+	ret float 0.000000e+00
+
+bb80:		; preds = %entry
+	ret float 0.000000e+00
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-03-13-IntToPtr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-03-13-IntToPtr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-03-13-IntToPtr.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-03-13-IntToPtr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | grep "16" | count 1
+
+define i8* @bork(i8** %qux) {
+  %tmp275 = load i8*, i8** %qux, align 1
+  %tmp275276 = ptrtoint i8* %tmp275 to i32
+  %tmp277 = add i32 %tmp275276, 16
+  %tmp277278 = inttoptr i32 %tmp277 to i8*
+  ret i8* %tmp277278
+}
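
The count 1 pins down the expected shape: the ptrtoint/add/inttoptr round-trip should become a single byte-addressed GEP, leaving exactly one literal 16 in the output. A hand-written sketch of that expected form:

define i8* @sketch(i8** %qux) {
  %tmp275 = load i8*, i8** %qux, align 1
  %adj = getelementptr i8, i8* %tmp275, i32 16
  ret i8* %adj
}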

Added: llvm/trunk/test/Transforms/InstCombine/2008-04-22-ByValBitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-04-22-ByValBitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-04-22-ByValBitcast.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-04-22-ByValBitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+;; The bitcast cannot be eliminated because byval arguments need
+;; the correct type, or at least a type of the correct size.
+; RUN: opt < %s -instcombine -S | grep bitcast
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9"
+	%struct.NSRect = type { [4 x float] }
+
+define void @foo(i8* %context) nounwind  {
+entry:
+	%tmp1 = bitcast i8* %context to %struct.NSRect*		; <%struct.NSRect*> [#uses=1]
+	call void (i32, ...) @bar( i32 3, %struct.NSRect* byval align 4  %tmp1 ) nounwind 
+	ret void
+}
+
+declare void @bar(i32, ...)

Added: llvm/trunk/test/Transforms/InstCombine/2008-04-28-VolatileStore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-04-28-VolatileStore.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-04-28-VolatileStore.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-04-28-VolatileStore.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -S | grep "store volatile"
+
+define void @test() {
+	%votf = alloca <4 x float>		; <<4 x float>*> [#uses=1]
+	store volatile <4 x float> zeroinitializer, <4 x float>* %votf, align 16
+	ret void
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-04-29-VolatileLoadDontMerge.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt < %s -instcombine -S | grep "load volatile" | count 2
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+ at g_1 = internal global i32 0		; <i32*> [#uses=3]
+
+define i32 @main() nounwind  {
+entry:
+	%tmp93 = icmp slt i32 0, 10		; <i1> [#uses=0]
+	%tmp34 = load volatile i32, i32* @g_1, align 4		; <i32> [#uses=1]
+	br label %bb
+
+bb:		; preds = %bb, %entry
+	%b.0.reg2mem.0 = phi i32 [ 0, %entry ], [ %tmp6, %bb ]		; <i32> [#uses=1]
+	%tmp3.reg2mem.0 = phi i32 [ %tmp34, %entry ], [ %tmp3, %bb ]		; <i32> [#uses=1]
+	%tmp4 = add i32 %tmp3.reg2mem.0, 5		; <i32> [#uses=1]
+	store volatile i32 %tmp4, i32* @g_1, align 4
+	%tmp6 = add i32 %b.0.reg2mem.0, 1		; <i32> [#uses=2]
+	%tmp9 = icmp slt i32 %tmp6, 10		; <i1> [#uses=1]
+	%tmp3 = load volatile i32, i32* @g_1, align 4		; <i32> [#uses=1]
+	br i1 %tmp9, label %bb, label %bb11
+
+bb11:		; preds = %bb
+	ret i32 0
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-04-29-VolatileLoadMerge.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; RUN: opt < %s -instcombine -S | grep "load volatile" | count 2
+; PR2262
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+ at g_1 = internal global i32 0		; <i32*> [#uses=3]
+
+define i32 @main(i32 %i) nounwind  {
+entry:
+	%tmp93 = icmp slt i32 %i, 10		; <i1> [#uses=0]
+	%tmp34 = load volatile i32, i32* @g_1, align 4		; <i32> [#uses=1]
+	br i1 %tmp93, label %bb11, label %bb
+
+bb:		; preds = %bb, %entry
+	%tmp3 = load volatile i32, i32* @g_1, align 4		; <i32> [#uses=1]
+	br label %bb11
+
+bb11:		; preds = %bb
+	%tmp4 = phi i32 [ %tmp34, %entry ], [ %tmp3, %bb ]		; <i32> [#uses=1]
+	ret i32 %tmp4
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-08-LiveStoreDelete.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt < %s -instcombine -S | grep "store i8" | count 3
+; PR2297
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+
+define i32 @a() nounwind  {
+entry:
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%tmp1 = call i8* @malloc( i32 10 ) nounwind 		; <i8*> [#uses=5]
+	%tmp3 = getelementptr i8, i8* %tmp1, i32 1		; <i8*> [#uses=1]
+	store i8 0, i8* %tmp3, align 1
+	%tmp5 = getelementptr i8, i8* %tmp1, i32 0		; <i8*> [#uses=1]
+	store i8 1, i8* %tmp5, align 1
+	%tmp7 = call i32 @strlen( i8* %tmp1 ) nounwind readonly 		; <i32> [#uses=1]
+	%tmp9 = getelementptr i8, i8* %tmp1, i32 0		; <i8*> [#uses=1]
+	store i8 0, i8* %tmp9, align 1
+	%tmp11 = call i32 (...) @b( i8* %tmp1 ) nounwind 		; <i32> [#uses=0]
+	ret i32 %tmp7
+}
+
+declare i8* @malloc(i32) nounwind 
+
+declare i32 @strlen(i8*) nounwind readonly 
+
+declare i32 @b(...)

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-08-StrLenSink.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-08-StrLenSink.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-08-StrLenSink.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-08-StrLenSink.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+; PR2297
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+
+define i32 @a() nounwind  {
+entry:
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%tmp1 = call i8* @malloc( i32 10 ) nounwind 		; <i8*> [#uses=5]
+	%tmp3 = getelementptr i8, i8* %tmp1, i32 1		; <i8*> [#uses=1]
+	store i8 0, i8* %tmp3, align 1
+	%tmp5 = getelementptr i8, i8* %tmp1, i32 0		; <i8*> [#uses=1]
+	store i8 1, i8* %tmp5, align 1
+; CHECK: store
+; CHECK: store
+; CHECK-NEXT: strlen
+; CHECK-NEXT: store
+	%tmp7 = call i32 @strlen( i8* %tmp1 ) nounwind readonly 		; <i32> [#uses=1]
+	%tmp9 = getelementptr i8, i8* %tmp1, i32 0		; <i8*> [#uses=1]
+	store i8 0, i8* %tmp9, align 1
+	%tmp11 = call i32 (...) @b( i8* %tmp1 ) nounwind 		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret i32 %tmp7
+}
+
+declare i8* @malloc(i32) nounwind 
+
+declare i32 @strlen(i8*) nounwind readonly 
+
+declare i32 @b(...)

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-09-SinkOfInvoke.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-09-SinkOfInvoke.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-09-SinkOfInvoke.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-09-SinkOfInvoke.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,37 @@
+; RUN: opt < %s -instcombine -disable-output
+; PR2303
+	%"struct.std::ctype<char>" = type { %"struct.std::locale::facet", i32*, i8, i32*, i32*, i16*, i8, [256 x i8], [256 x i8], i8 }
+	%"struct.std::locale::facet" = type { i32 (...)**, i32 }
+
+declare i32* @_ZNSt6locale5facet15_S_get_c_localeEv()
+
+declare i32** @__ctype_toupper_loc() readnone 
+
+declare i32** @__ctype_tolower_loc() readnone 
+
+define void @_ZNSt5ctypeIcEC2EPiPKtbm(%"struct.std::ctype<char>"* %this, i32* %unnamed_arg, i16* %__table, i8 zeroext  %__del, i64 %__refs) personality i32 (...)* @__gxx_personality_v0 {
+entry:
+	%tmp8 = invoke i32* @_ZNSt6locale5facet15_S_get_c_localeEv( )
+			to label %invcont unwind label %lpad		; <i32*> [#uses=0]
+
+invcont:		; preds = %entry
+	%tmp32 = invoke i32** @__ctype_toupper_loc( ) readnone 
+			to label %invcont31 unwind label %lpad		; <i32**> [#uses=0]
+
+invcont31:		; preds = %invcont
+	%tmp38 = invoke i32** @__ctype_tolower_loc( ) readnone 
+			to label %invcont37 unwind label %lpad		; <i32**> [#uses=1]
+
+invcont37:		; preds = %invcont31
+	%tmp39 = load i32*, i32** %tmp38, align 8		; <i32*> [#uses=1]
+	%tmp41 = getelementptr %"struct.std::ctype<char>", %"struct.std::ctype<char>"* %this, i32 0, i32 4		; <i32**> [#uses=1]
+	store i32* %tmp39, i32** %tmp41, align 8
+	ret void
+
+lpad:		; preds = %invcont31, %invcont, %entry
+        %exn = landingpad {i8*, i32}
+                 cleanup
+	unreachable
+}
+
+declare i32 @__gxx_personality_v0(...)

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-17-InfLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-17-InfLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-17-InfLoop.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-17-InfLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -instcombine -disable-output
+; PR2339
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-s0:0:64-f80:32:32"
+target triple = "i686-pc-linux-gnu"
+
+declare void @BZALLOC(i32)
+
+define void @f(i32) {
+entry:
+	%blockSize100k = alloca i32		; <i32*> [#uses=2]
+	store i32 %0, i32* %blockSize100k
+	%n = alloca i32		; <i32*> [#uses=2]
+	load i32, i32* %blockSize100k		; <i32>:1 [#uses=1]
+	store i32 %1, i32* %n
+	load i32, i32* %n		; <i32>:2 [#uses=1]
+	add i32 %2, 2		; <i32>:3 [#uses=1]
+	mul i32 %3, ptrtoint (i32* getelementptr (i32, i32* null, i32 1) to i32)		; <i32>:4 [#uses=1]
+	call void @BZALLOC( i32 %4 )
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-18-FoldIntToPtr.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt < %s -instcombine -S | grep "ret i1 false" | count 2
+; PR2329
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i386-pc-linux-gnu"
+
+define i1 @f1() {
+  ret i1 icmp eq (i8* inttoptr (i32 1 to i8*), i8* inttoptr (i32 2 to i8*))
+}
+
+define i1 @f2() {
+  ret i1 icmp eq (i8* inttoptr (i16 1 to i8*), i8* inttoptr (i16 2 to i8*))
+}
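
Both functions should fold to ret i1 false: under the 32-bit pointers declared above, inttoptr of the distinct constants 1 and 2 yields distinct addresses, whether the source is i32 or a zero-extended i16. Comment-only sketch:

; f1: inttoptr(i32 1) vs inttoptr(i32 2)  ->  addresses 1 vs 2  ->  eq folds to false
; f2: inttoptr(i16 1) vs inttoptr(i16 2)  ->  zero-extended to 1 vs 2  ->  same result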

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-22-IDivVector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-22-IDivVector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-22-IDivVector.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-22-IDivVector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,6 @@
+; RUN: opt < %s -instcombine -disable-output
+
+define <3 x i8> @f(<3 x i8> %i) {
+  %A = sdiv <3 x i8> %i, %i
+  ret <3 x i8> %A
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-23-CompareFold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-23-CompareFold.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-23-CompareFold.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-23-CompareFold.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+; PR2359
+
+; CHECK-LABEL: @f(
+; CHECK: ret i1 false
+define i1 @f(i8* %x) {
+entry:
+       %tmp462 = load i8, i8* %x, align 1          ; <i8> [#uses=1]
+       %tmp462463 = sitofp i8 %tmp462 to float         ; <float> [#uses=1]
+       %tmp464 = fcmp ugt float %tmp462463, 0x47EFFFFFE0000000         ; <i1>
+       ret i1 %tmp464
+}
+
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-31-AddBool.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-31-AddBool.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-31-AddBool.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-31-AddBool.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR2389
+
+; CHECK: xor
+
+define i1 @test(i1 %a, i1 %b) {
+  %A = add i1 %a, %b
+  ret i1 %A
+}
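
One-bit addition is addition mod 2, which is exactly xor; the carry has nowhere to go. Expected output, as a sketch:

define i1 @sketch(i1 %a, i1 %b) {
  %A = xor i1 %a, %b
  ret i1 %A
}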

Added: llvm/trunk/test/Transforms/InstCombine/2008-05-31-Bools.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-05-31-Bools.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-05-31-Bools.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-05-31-Bools.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt < %s -instcombine -S > %t
+; RUN: grep "xor" %t
+; RUN: grep "and" %t
+; RUN: not grep "div" %t
+
+define i1 @foo1(i1 %a, i1 %b) {
+  %A = sub i1 %a, %b
+  ret i1 %A
+}
+
+define i1 @foo2(i1 %a, i1 %b) {
+  %A = mul i1 %a, %b
+  ret i1 %A
+}
+
+define i1 @foo3(i1 %a, i1 %b) {
+  %A = udiv i1 %a, %b
+  ret i1 %A
+}
+
+define i1 @foo4(i1 %a, i1 %b) {
+  %A = sdiv i1 %a, %b
+  ret i1 %A
+}
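
One expected fold per operator, summarized by the three greps above. Hand-written sketch of the rewrites:

; sub i1 %a, %b   ->  xor i1 %a, %b    (subtraction mod 2 equals addition mod 2)
; mul i1 %a, %b   ->  and i1 %a, %b
; udiv i1 %a, %b  ->  %a               (%b == false is UB, so %b must be true)
; sdiv i1 %a, %b  ->  %a               (same reasoning)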

Added: llvm/trunk/test/Transforms/InstCombine/2008-06-05-ashr-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-06-05-ashr-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-06-05-ashr-crash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-06-05-ashr-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,7 @@
+; RUN: opt < %s -instcombine
+
+define i65 @foo(i65 %x) nounwind  {
+entry:
+	%tmp2 = ashr i65 %x, 65		; <i65> [#uses=1]
+	ret i65 %tmp2
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-06-08-ICmpPHI.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,47 @@
+; RUN: opt < %s -instcombine -S | grep "phi i32" | count 2
+
+define void @test() nounwind  {
+entry:
+	br label %bb
+
+bb:		; preds = %bb16, %entry
+	%i.0 = phi i32 [ 0, %entry ], [ %indvar.next, %somebb ]		; <i32> [#uses=1]
+	%x.0 = phi i32 [ 37, %entry ], [ %tmp17, %somebb ]		; <i32> [#uses=1]
+	%tmp = tail call i32 (...) @bork( ) nounwind 		; <i32> [#uses=0]
+	%tmp1 = tail call i32 (...) @bork( ) nounwind 		; <i32> [#uses=0]
+	%tmp2 = tail call i32 (...) @bork( ) nounwind 		; <i32> [#uses=1]
+	%tmp3 = icmp eq i32 %tmp2, 0		; <i1> [#uses=1]
+	br i1 %tmp3, label %bb7, label %bb5
+
+bb5:		; preds = %bb
+	%tmp6 = tail call i32 (...) @bork( ) nounwind 		; <i32> [#uses=0]
+	br label %bb7
+
+bb7:		; preds = %bb5, %bb
+	%tmp8 = tail call i32 (...) @bork( ) nounwind 		; <i32> [#uses=0]
+	%tmp9 = tail call i32 (...) @bork( ) nounwind 		; <i32> [#uses=0]
+	%tmp11 = icmp eq i32 %x.0, 37		; <i1> [#uses=1]
+	br i1 %tmp11, label %bb14, label %bb16
+
+bb14:		; preds = %bb7
+	%tmp15 = tail call i32 (...) @bar( ) nounwind 		; <i32> [#uses=0]
+	br label %bb16
+
+bb16:		; preds = %bb14, %bb7
+	%tmp17 = tail call i32 (...) @zap( ) nounwind 		; <i32> [#uses=1]
+	%indvar.next = add i32 %i.0, 1		; <i32> [#uses=2]
+	%exitcond = icmp eq i32 %indvar.next, 42		; <i1> [#uses=1]
+	br i1 %exitcond, label %return, label %somebb
+
+somebb:
+	br label %bb
+
+return:		; preds = %bb16
+	ret void
+}
+
+declare i32 @bork(...)
+
+declare i32 @bar(...)
+
+declare i32 @zap(...)

Added: llvm/trunk/test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-06-13-InfiniteLoopStore.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt < %s -instcombine -S | grep "store i32" | count 2
+
+ at g_139 = global i32 0           ; <i32*> [#uses=2]
+
+define void @func_56(i32 %p_60) nounwind  {
+entry:
+        store i32 1, i32* @g_139, align 4
+        %tmp1 = icmp ne i32 %p_60, 0            ; <i1> [#uses=1]
+        %tmp12 = zext i1 %tmp1 to i8            ; <i8> [#uses=1]
+        %toBool = icmp ne i8 %tmp12, 0          ; <i1> [#uses=1]
+        br i1 %toBool, label %bb, label %return
+
+bb:             ; preds = %bb, %entry
+        store i32 1, i32* @g_139, align 4
+        br label %bb
+
+return:         ; preds = %entry
+        ret void
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-06-13-ReadOnlyCallStore.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt < %s -instcombine -S | grep "store i8" | count 2
+
+define i32 @a(i8* %s) nounwind  {
+entry:
+	store i8 0, i8* %s, align 1 ; This store cannot be eliminated!
+	%tmp3 = call i32 @strlen( i8* %s ) nounwind readonly
+	%tmp5 = icmp ne i32 %tmp3, 0
+	br i1 %tmp5, label %bb, label %bb8
+
+bb:		; preds = %entry
+	store i8 0, i8* %s, align 1
+	br label %bb8
+
+bb8:
+	ret i32 %tmp3
+}
+
+declare i32 @strlen(i8*) nounwind readonly 
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-06-19-UncondLoad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-06-19-UncondLoad.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-06-19-UncondLoad.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-06-19-UncondLoad.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -instcombine -S | grep load | count 3
+; PR2471
+
+declare i32 @x(i32*)
+define i32 @b(i32* %a, i32* %b) {
+entry:
+        %tmp1 = load i32, i32* %a            
+        %tmp3 = load i32, i32* %b           
+        %add = add i32 %tmp1, %tmp3   
+        %call = call i32 @x( i32* %a )
+        %tobool = icmp ne i32 %add, 0
+	; Not safe to turn this into an unconditional load.
+        %cond = select i1 %tobool, i32* %b, i32* %a             
+        %tmp8 = load i32, i32* %cond       
+        ret i32 %tmp8
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-06-21-CompareMiscomp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | grep "icmp eq i32 %In, 15"
+; PR2479
+; (See also PR1800.)
+
+define i1 @test(i32 %In) {
+	%c1 = icmp ugt i32 %In, 13
+	%c2 = icmp eq i32 %In, 15
+	%V = and i1 %c1, %c2
+	ret i1 %V
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-06-24-StackRestore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-06-24-StackRestore.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-06-24-StackRestore.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-06-24-StackRestore.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -instcombine -S | grep "call.*llvm.stackrestore"
+; PR2488
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i386-pc-linux-gnu"
+ at p = weak global i8* null		; <i8**> [#uses=2]
+
+define i32 @main() nounwind  {
+entry:
+	%tmp248 = call i8* @llvm.stacksave( )		; <i8*> [#uses=1]
+	%tmp2752 = alloca i32		; <i32*> [#uses=2]
+	%tmpcast53 = bitcast i32* %tmp2752 to i8*		; <i8*> [#uses=1]
+	store i32 2, i32* %tmp2752, align 4
+	store volatile i8* %tmpcast53, i8** @p, align 4
+	br label %bb44
+
+bb:		; preds = %bb44
+	ret i32 0
+
+bb44:		; preds = %bb44, %entry
+	%indvar = phi i32 [ 0, %entry ], [ %tmp3857, %bb44 ]		; <i32> [#uses=1]
+	%tmp249 = phi i8* [ %tmp248, %entry ], [ %tmp2, %bb44 ]		; <i8*> [#uses=1]
+	%tmp3857 = add i32 %indvar, 1		; <i32> [#uses=3]
+	call void @llvm.stackrestore( i8* %tmp249 )
+	%tmp2 = call i8* @llvm.stacksave( )		; <i8*> [#uses=1]
+	%tmp4 = srem i32 %tmp3857, 1000		; <i32> [#uses=2]
+	%tmp5 = add i32 %tmp4, 1		; <i32> [#uses=1]
+	%tmp27 = alloca i32, i32 %tmp5		; <i32*> [#uses=3]
+	%tmpcast = bitcast i32* %tmp27 to i8*		; <i8*> [#uses=1]
+	store i32 1, i32* %tmp27, align 4
+	%tmp34 = getelementptr i32, i32* %tmp27, i32 %tmp4		; <i32*> [#uses=1]
+	store i32 2, i32* %tmp34, align 4
+	store volatile i8* %tmpcast, i8** @p, align 4
+	%exitcond = icmp eq i32 %tmp3857, 999999		; <i1> [#uses=1]
+	br i1 %exitcond, label %bb, label %bb44
+}
+
+declare i8* @llvm.stacksave() nounwind 
+
+declare void @llvm.stackrestore(i8*) nounwind 

Added: llvm/trunk/test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-07-08-ShiftOneAndOne.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i1 @PR2330(i32 %a) {
+; CHECK-LABEL: @PR2330(
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 %a, 0
+; CHECK-NEXT:    ret i1 [[TOBOOL]]
+;
+  %tmp15 = shl i32 1, %a
+  %tmp237 = and i32 %tmp15, 1
+  %toBool = icmp eq i32 %tmp237, 0
+  ret i1 %toBool
+}
+
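
The reasoning behind the CHECK lines: 1 << %a has bit 0 set exactly when %a == 0 (for in-range shift amounts), so (1 << %a) & 1 == 0 holds exactly when %a != 0. Comment-only summary:

; %a == 0:  (1 << 0) & 1 = 1  ->  the eq-0 test is false
; %a >= 1:  (1 << %a) & 1 = 0 ->  the eq-0 test is true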

Added: llvm/trunk/test/Transforms/InstCombine/2008-07-08-SubAnd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-07-08-SubAnd.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-07-08-SubAnd.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-07-08-SubAnd.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | grep -v "i32 8"
+; PR2330
+
+define i32 @a(i32 %a) nounwind  {
+entry:
+	%tmp2 = sub i32 8, %a		; <i32> [#uses=1]
+	%tmp3 = and i32 %tmp2, 7		; <i32> [#uses=1]
+	ret i32 %tmp3
+}

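The fold checked above: the 'and ..., 7' demands only the low three bits of the subtraction, and borrows only propagate upward, so the constant 8 (congruent to 0 mod 8) may be replaced by 0. A sketch of one valid result (names assumed unchanged):

  %tmp2 = sub i32 0, %a
  %tmp3 = and i32 %tmp2, 7
  ret i32 %tmp3
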
Added: llvm/trunk/test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-07-08-VolatileLoadMerge.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,26 @@
+; RUN: opt < %s -instcombine -S | grep "load volatile" | count 2
+; PR2496
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+ at g_1 = internal global i32 0		; <i32*> [#uses=3]
+
+define i32 @main() nounwind  {
+entry:
+	%tmp93 = icmp slt i32 0, 10		; <i1> [#uses=0]
+	%tmp34 = load volatile i32, i32* @g_1, align 4		; <i32> [#uses=1]
+	br label %bb
+
+bb:		; preds = %bb, %entry
+	%b.0.reg2mem.0 = phi i32 [ 0, %entry ], [ %tmp6, %bb ]		; <i32> [#uses=1]
+	%tmp3.reg2mem.0 = phi i32 [ %tmp3, %bb ], [ %tmp34, %entry ]
+	%tmp4 = add i32 %tmp3.reg2mem.0, 5		; <i32> [#uses=1]
+	store volatile i32 %tmp4, i32* @g_1, align 4
+	%tmp6 = add i32 %b.0.reg2mem.0, 1		; <i32> [#uses=2]
+	%tmp9 = icmp slt i32 %tmp6, 10		; <i1> [#uses=1]
+	%tmp3 = load volatile i32, i32* @g_1, align 4		; <i32> [#uses=1]
+	br i1 %tmp9, label %bb, label %bb11
+
+bb11:		; preds = %bb
+	ret i32 0
+}
+

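The invariant here is that both volatile loads of @g_1 survive, which is what the count of 2 pins down: volatile accesses may not be merged even when they reload the same address. A minimal sketch of the pattern that must remain two separate loads:

  %a = load volatile i32, i32* @g_1, align 4
  %b = load volatile i32, i32* @g_1, align 4   ; not foldable into %a
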
Added: llvm/trunk/test/Transforms/InstCombine/2008-07-09-SubAndError.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-07-09-SubAndError.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-07-09-SubAndError.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-07-09-SubAndError.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | not grep "sub i32 0"
+; PR2330
+
+define i32 @foo(i32 %a) nounwind {
+entry:
+  %A = sub i32 5, %a
+  %B = and i32 %A, 2
+  ret i32 %B
+}

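This is the unsound direction of the same demanded-bits fold: under the mask 2, bit 1 of (5 - %a) depends on a borrow out of bit 0, so 5 may not be replaced by 0. A concrete counterexample with %a = 1:

  (5 - 1) & 2 = 4 & 2 = 0
  (0 - 1) & 2 = -1 & 2 = 2
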
Added: llvm/trunk/test/Transforms/InstCombine/2008-07-10-CastSextBool.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-07-10-CastSextBool.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-07-10-CastSextBool.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-07-10-CastSextBool.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define i1 @PR2539_A(i1 %A) {
+; CHECK-LABEL: @PR2539_A(
+; CHECK-NEXT:    [[C:%.*]] = xor i1 %A, true
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %B = zext i1 %A to i32
+  %C = icmp slt i32 %B, 1
+  ret i1 %C
+}
+
+
+define i1 @PR2539_B(i1 zeroext %b) {
+; CHECK-LABEL: @PR2539_B(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i1 %b, true
+  ret i1 %cmp
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2008-07-11-RemAnd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-07-11-RemAnd.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-07-11-RemAnd.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-07-11-RemAnd.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | not grep rem
+; PR2330
+
+define i32 @a(i32 %b) nounwind  {
+entry:
+	srem i32 %b, 8		; <i32>:0 [#uses=1]
+	and i32 %0, 1		; <i32>:1 [#uses=1]
+	ret i32 %1
+}

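Why the rem can vanish entirely: %b - (%b srem 8) is always a multiple of 8, so the two values agree on their low three bits, and in particular on bit 0. The expected reduction (result name assumed):

  %0 = and i32 %b, 1
  ret i32 %0
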
Added: llvm/trunk/test/Transforms/InstCombine/2008-07-13-DivZero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-07-13-DivZero.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-07-13-DivZero.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-07-13-DivZero.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -instcombine -S | grep "lshr.*3"
+; RUN: opt < %s -instcombine -S | grep "call .*%cond"
+; PR2506
+
+; We can simplify the operand of udiv to '8', but not the operand to the
+; call.  If the callee never returns, we can't assume the div is reachable.
+define i32 @a(i32 %x, i32 %y) {
+entry:
+        %tobool = icmp ne i32 %y, 0             ; <i1> [#uses=1]
+        %cond = select i1 %tobool, i32 8, i32 0         ; <i32> [#uses=2]
+        %call = call i32 @b( i32 %cond )                ; <i32> [#uses=0]
+        %div = udiv i32 %x, %cond               ; <i32> [#uses=1]
+        ret i32 %div
+}
+
+declare i32 @b(i32)

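What the two greps pin down: in the udiv, %cond must be non-zero (udiv by zero is undefined), so it can be replaced by 8 and the division strength-reduced to a shift; in the call it must remain %cond, since @b may never return and the division may be unreachable. Sketched expected output (names assumed):

  %call = call i32 @b(i32 %cond)   ; operand unchanged
  %div = lshr i32 %x, 3            ; udiv i32 %x, 8
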
Added: llvm/trunk/test/Transforms/InstCombine/2008-07-16-fsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-07-16-fsub.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-07-16-fsub.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-07-16-fsub.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -S | grep sub
+; PR2553
+
+define double @test(double %X) nounwind {
+	; fsub of self can't be optimized away.
+	%Y = fsub double %X, %X
+	ret double %Y
+}

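The reason the fsub must remain: if %X is NaN or an infinity, %X - %X is NaN rather than 0.0. Only under fast-math assumptions is the fold to zero sound; a hypothetical variant that may legitimately fold:

  define double @test_nnan(double %X) {
    %Y = fsub nnan double %X, %X   ; no NaNs assumed, so 0.0 is a valid result
    ret double %Y
  }
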
Added: llvm/trunk/test/Transforms/InstCombine/2008-08-05-And.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-08-05-And.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-08-05-And.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-08-05-And.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -instcombine -S | not grep or
+; PR2629
+
+define void @f(i8* %x) nounwind  {
+entry:
+        br label %bb
+
+bb:
+	%g1 = getelementptr i8, i8* %x, i32 0
+        %l1 = load i8, i8* %g1, align 1
+	%s1 = sub i8 %l1, 6
+	%c1 = icmp ugt i8 %s1, 2
+	%s2 = sub i8 %l1, 10
+        %c2 = icmp ugt i8 %s2, 2
+        %a1 = and i1 %c1, %c2
+	br i1 %a1, label %incompatible, label %okay
+
+okay:
+        ret void
+
+incompatible:
+        ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-09-02-VectorCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-09-02-VectorCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-09-02-VectorCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-09-02-VectorCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt < %s -instcombine
+
+define void @entry(i32 %m_task_id, i32 %start_x, i32 %end_x, i32 %start_y, i32 %end_y) {
+	br label %1
+
+; <label>:1		; preds = %4, %0
+	%2 = icmp slt i32 0, %end_y		; <i1> [#uses=1]
+	br i1 %2, label %4, label %3
+
+; <label>:3		; preds = %1
+	ret void
+
+; <label>:4		; preds = %6, %1
+	%5 = icmp slt i32 0, %end_x		; <i1> [#uses=1]
+	br i1 %5, label %6, label %1
+
+; <label>:6		; preds = %4
+	%7 = srem <2 x i32> zeroinitializer, zeroinitializer		; <<2 x i32>> [#uses=1]
+	%8 = extractelement <2 x i32> %7, i32 1		; <i32> [#uses=1]
+	%9 = select i1 false, i32 0, i32 %8		; <i32> [#uses=1]
+	%10 = insertelement <2 x i32> zeroinitializer, i32 %9, i32 1		; <<2 x i32>> [#uses=1]
+	%11 = extractelement <2 x i32> %10, i32 1		; <i32> [#uses=1]
+	%12 = insertelement <4 x i32> zeroinitializer, i32 %11, i32 3		; <<4 x i32>> [#uses=1]
+	%13 = sitofp <4 x i32> %12 to <4 x float>		; <<4 x float>> [#uses=1]
+	store <4 x float> %13, <4 x float>* null
+	br label %4
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-10-11-DivCompareFold.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -S | grep "ret i1 false"
+; PR2697
+
+define i1 @x(i32 %x) nounwind {
+	%div = sdiv i32 %x, 65536		; <i32> [#uses=1]
+	%cmp = icmp slt i32 %div, -65536
+	ret i1 %cmp
+}

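The arithmetic behind the expected fold: sdiv i32 %x, 65536 is bounded below by -2147483648 / 65536 = -32768, and -32768 > -65536, so the slt comparison can never be true. The whole function should reduce to:

  define i1 @x(i32 %x) nounwind {
    ret i1 false
  }
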
Added: llvm/trunk/test/Transforms/InstCombine/2008-10-23-ConstFoldWithoutMask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-10-23-ConstFoldWithoutMask.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-10-23-ConstFoldWithoutMask.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-10-23-ConstFoldWithoutMask.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine
+; PR2940
+
+define i32 @tstid() {
+	%var0 = inttoptr i32 1 to i8*		; <i8*> [#uses=1]
+	%var2 = ptrtoint i8* %var0 to i32		; <i32> [#uses=1]
+	ret i32 %var2
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-11-01-SRemDemandedBits.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,8 @@
+; RUN: opt < %s -instcombine -S | grep "ret i1 true"
+; PR2993
+
+define i1 @foo(i32 %x) {
+  %1 = srem i32 %x, -1
+  %2 = icmp eq i32 %1, 0
+  ret i1 %2
+}

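srem %x, -1 is 0 for every input (the one overflowing case, %x = -2147483648, is undefined anyway), so the equality compare folds to true, matching the grep:

  define i1 @foo(i32 %x) {
    ret i1 true
  }
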
Added: llvm/trunk/test/Transforms/InstCombine/2008-11-08-FCmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-11-08-FCmp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-11-08-FCmp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-11-08-FCmp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,63 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR3021
+
+; When inst combining an FCMP with the LHS coming from a uitofp instruction, we
+; can't lower it to signed ICMP instructions.
+
+; CHECK-LABEL: @test1(
+define i1 @test1(i32 %val) {
+  %1 = uitofp i32 %val to double
+  %2 = fcmp ole double %1, 0.000000e+00
+; CHECK: icmp eq i32 %val, 0
+  ret i1 %2
+}
+
+; CHECK-LABEL: @test2(
+define i1 @test2(i32 %val) {
+  %1 = uitofp i32 %val to double
+  %2 = fcmp olt double %1, 0.000000e+00
+  ret i1 %2
+; CHECK: ret i1 false
+}
+
+; CHECK-LABEL: @test3(
+define i1 @test3(i32 %val) {
+  %1 = uitofp i32 %val to double
+  %2 = fcmp oge double %1, 0.000000e+00
+  ret i1 %2
+; CHECK: ret i1 true
+}
+
+; CHECK-LABEL: @test4(
+define i1 @test4(i32 %val) {
+  %1 = uitofp i32 %val to double
+  %2 = fcmp ogt double %1, 0.000000e+00
+; CHECK: icmp ne i32 %val, 0
+  ret i1 %2
+}
+
+; CHECK-LABEL: @test5(
+define i1 @test5(i32 %val) {
+  %1 = uitofp i32 %val to double
+  %2 = fcmp ogt double %1, -4.400000e+00
+  ret i1 %2
+; CHECK: ret i1 true
+}
+
+; CHECK-LABEL: @test6(
+define i1 @test6(i32 %val) {
+  %1 = uitofp i32 %val to double
+  %2 = fcmp olt double %1, -4.400000e+00
+  ret i1 %2
+; CHECK: ret i1 false
+}
+
+; Check that optimizing unsigned >= comparisons correctly distinguishes
+; positive and negative constants.  <rdar://problem/12029145>
+; CHECK-LABEL: @test7(
+define i1 @test7(i32 %val) {
+  %1 = uitofp i32 %val to double
+  %2 = fcmp oge double %1, 3.200000e+00
+  ret i1 %2
+; CHECK: icmp ugt i32 %val, 3
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-11-27-IDivVector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-11-27-IDivVector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-11-27-IDivVector.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-11-27-IDivVector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | not grep div
+
+define <2 x i8> @f(<2 x i8> %x) {
+  %A = udiv <2 x i8> %x, <i8 1, i8 1>
+  ret <2 x i8> %A
+}
+
+define <2 x i8> @g(<2 x i8> %x) {
+  %A = sdiv <2 x i8> %x, <i8 1, i8 1>
+  ret <2 x i8> %A
+}

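Both divisions are by a splat of 1 and are therefore the identity, so each function should reduce to returning %x unchanged, leaving no div in the output:

  define <2 x i8> @f(<2 x i8> %x) {
    ret <2 x i8> %x
  }
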
Added: llvm/trunk/test/Transforms/InstCombine/2008-11-27-MultiplyIntVec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-11-27-MultiplyIntVec.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-11-27-MultiplyIntVec.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-11-27-MultiplyIntVec.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | not grep mul
+
+define <2 x i8> @f(<2 x i8> %x) {
+  %A = mul <2 x i8> %x, <i8 1, i8 1>
+  ret <2 x i8> %A
+}
+
+define <2 x i8> @g(<2 x i8> %x) {
+  %A = mul <2 x i8> %x, <i8 -1, i8 -1>
+  ret <2 x i8> %A
+}

Added: llvm/trunk/test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2008-12-17-SRemNegConstVec.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,7 @@
+; RUN: opt < %s -instcombine -S | grep "i8 2, i8 2"
+; PR2756
+
+define <2 x i8> @foo(<2 x i8> %x) {
+  %A = srem <2 x i8> %x, <i8 2, i8 -2>
+  ret <2 x i8> %A
+}

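An srem result takes the sign of its dividend, so the divisor's sign is irrelevant; each divisor element can be canonicalized to its absolute value, turning the divisor into the splat the grep looks for:

  %A = srem <2 x i8> %x, <i8 2, i8 2>
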
Added: llvm/trunk/test/Transforms/InstCombine/2009-01-05-i128-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-01-05-i128-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-01-05-i128-crash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-01-05-i128-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt < %s -instcombine | llvm-dis
+; PR3235
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define hidden i128 @"\01_gfortrani_max_value"(i32 %length, i32 %signed_flag) nounwind {
+entry:
+	switch i32 %length, label %bb13 [
+		i32 1, label %bb17
+		i32 4, label %bb9
+		i32 8, label %bb5
+	]
+
+bb5:		; preds = %entry
+	%0 = icmp eq i32 %signed_flag, 0		; <i1> [#uses=1]
+	%iftmp.28.0 = select i1 %0, i128 18446744073709551615, i128 9223372036854775807		; <i128> [#uses=1]
+	ret i128 %iftmp.28.0
+
+bb9:		; preds = %entry
+	ret i128 0
+
+bb13:		; preds = %entry
+	ret i128 0
+
+bb17:		; preds = %entry
+	ret i128 0
+}

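For reference, the two i128 constants selected in bb5 are the 64-bit extremes matching the function's name: 18446744073709551615 = 2^64 - 1 (unsigned maximum) and 9223372036854775807 = 2^63 - 1 (signed maximum).
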
Added: llvm/trunk/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,28 @@
+; RUN: opt < %s -instcombine -S > %t
+; RUN: grep ", align 4" %t | count 3
+; RUN: grep ", align 8" %t | count 3
+; rdar://6480438
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9.6"
+	%struct.Key = type { { i32, i32 } }
+	%struct.anon = type <{ i8, [3 x i8], i32 }>
+
+define i32 @bar(i64 %key_token2) nounwind {
+entry:
+	%iospec = alloca %struct.Key		; <%struct.Key*> [#uses=3]
+	%ret = alloca i32		; <i32*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%0 = getelementptr %struct.Key, %struct.Key* %iospec, i32 0, i32 0		; <{ i32, i32 }*> [#uses=2]
+	%1 = getelementptr { i32, i32 }, { i32, i32 }* %0, i32 0, i32 0		; <i32*> [#uses=1]
+	store i32 0, i32* %1, align 4
+	%2 = getelementptr { i32, i32 }, { i32, i32 }* %0, i32 0, i32 1		; <i32*> [#uses=1]
+	store i32 0, i32* %2, align 4
+	%3 = getelementptr %struct.Key, %struct.Key* %iospec, i32 0, i32 0		; <{ i32, i32 }*> [#uses=1]
+	%4 = bitcast { i32, i32 }* %3 to i64*		; <i64*> [#uses=1]
+	store i64 %key_token2, i64* %4, align 4
+	%5 = call i32 (...) @foo(%struct.Key* byval align 4 %iospec, i32* %ret) nounwind		; <i32> [#uses=0]
+	%6 = load i32, i32* %ret, align 4		; <i32> [#uses=1]
+	ret i32 %6
+}
+
+declare i32 @foo(...)

Added: llvm/trunk/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | grep "store.*addrspace(1)"
+; PR3335
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9.6"
+
+define i32 @test(i32* %P) nounwind {
+entry:
+  %Q = addrspacecast i32* %P to i32 addrspace(1)*
+  store i32 0, i32 addrspace(1)* %Q, align 4
+  ret i32 0
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-01-19-fmod-constant-float-specials.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,315 @@
+; RUN: opt < %s -simplifycfg -instcombine -S | grep 0x7FF8000000000000 | count 12
+; RUN: opt < %s -simplifycfg -instcombine -S | grep "0\.0" | count 3
+; RUN: opt < %s -simplifycfg -instcombine -S | grep "3\.5" | count 1
+;
+
+; ModuleID = 'apf.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9.6"
+@"\01LC" = internal constant [4 x i8] c"%f\0A\00"		; <[4 x i8]*> [#uses=1]
+
+define void @foo1() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0x7FF0000000000000, float* %x, align 4
+	store float 0x7FF8000000000000, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+declare i32 @printf(i8*, ...) nounwind
+
+define void @foo2() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0x7FF0000000000000, float* %x, align 4
+	store float 0.000000e+00, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo3() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0x7FF0000000000000, float* %x, align 4
+	store float 3.500000e+00, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo4() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0x7FF0000000000000, float* %x, align 4
+	store float 0x7FF0000000000000, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo5() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0x7FF8000000000000, float* %x, align 4
+	store float 0x7FF0000000000000, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo6() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0x7FF8000000000000, float* %x, align 4
+	store float 0.000000e+00, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo7() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0x7FF8000000000000, float* %x, align 4
+	store float 3.500000e+00, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo8() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0x7FF8000000000000, float* %x, align 4
+	store float 0x7FF8000000000000, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo9() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0.000000e+00, float* %x, align 4
+	store float 0x7FF8000000000000, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo10() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0.000000e+00, float* %x, align 4
+	store float 0x7FF0000000000000, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo11() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0.000000e+00, float* %x, align 4
+	store float 0.000000e+00, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo12() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 0.000000e+00, float* %x, align 4
+	store float 3.500000e+00, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo13() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 3.500000e+00, float* %x, align 4
+	store float 0x7FF8000000000000, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo14() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 3.500000e+00, float* %x, align 4
+	store float 0x7FF0000000000000, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo15() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 3.500000e+00, float* %x, align 4
+	store float 0.000000e+00, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}
+
+define void @foo16() nounwind {
+entry:
+	%y = alloca float		; <float*> [#uses=2]
+	%x = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	store float 3.500000e+00, float* %x, align 4
+	store float 3.500000e+00, float* %y, align 4
+	%0 = load float, float* %y, align 4		; <float> [#uses=1]
+	%1 = fpext float %0 to double		; <double> [#uses=1]
+	%2 = load float, float* %x, align 4		; <float> [#uses=1]
+	%3 = fpext float %2 to double		; <double> [#uses=1]
+	%4 = frem double %3, %1		; <double> [#uses=1]
+	%5 = call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), double %4) nounwind		; <i32> [#uses=0]
+	br label %return
+
+return:		; preds = %entry
+	ret void
+}

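A worked tally of the sixteen frem combinations above, which is where the three grep counts come from: the result is NaN whenever either operand is NaN, the dividend is an infinity, or the divisor is zero; otherwise frem returns the dividend reduced by a whole multiple of the divisor.

  inf frem {nan, 0.0, 3.5, inf}   -> NaN (foo1-foo4)
  nan frem {inf, 0.0, 3.5, nan}   -> NaN (foo5-foo8)
  0.0 frem nan, 0.0 frem 0.0      -> NaN;  0.0 frem inf, 0.0 frem 3.5 -> 0.0
  3.5 frem nan, 3.5 frem 0.0      -> NaN;  3.5 frem inf -> 3.5;  3.5 frem 3.5 -> 0.0

That is twelve NaNs (0x7FF8000000000000), three 0.0 results, and one 3.5.
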
Added: llvm/trunk/test/Transforms/InstCombine/2009-01-19-fmod-constant-float.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-01-19-fmod-constant-float.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-01-19-fmod-constant-float.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-01-19-fmod-constant-float.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,75 @@
+; RUN: opt < %s -simplifycfg -instcombine -S | grep 0x3FB99999A0000000 | count 2
+; RUN: opt < %s -simplifycfg -instcombine -S | grep 0xBFB99999A0000000 | count 2
+; Check constant folding for 'frem'. PR3316.
+
+; ModuleID = 'tt.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9.6"
+
+define float @test1() nounwind {
+entry:
+	%retval = alloca float		; <float*> [#uses=2]
+	%0 = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%1 = frem double 1.000000e-01, 1.000000e+00	; <double> [#uses=1]
+	%2 = fptrunc double %1 to float		; <float> [#uses=1]
+	store float %2, float* %0, align 4
+	%3 = load float, float* %0, align 4		; <float> [#uses=1]
+	store float %3, float* %retval, align 4
+	br label %return
+
+return:		; preds = %entry
+	%retval1 = load float, float* %retval		; <float> [#uses=1]
+	ret float %retval1
+}
+
+define float @test2() nounwind {
+entry:
+	%retval = alloca float		; <float*> [#uses=2]
+	%0 = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%1 = frem double -1.000000e-01, 1.000000e+00	; <double> [#uses=1]
+	%2 = fptrunc double %1 to float		; <float> [#uses=1]
+	store float %2, float* %0, align 4
+	%3 = load float, float* %0, align 4		; <float> [#uses=1]
+	store float %3, float* %retval, align 4
+	br label %return
+
+return:		; preds = %entry
+	%retval1 = load float, float* %retval		; <float> [#uses=1]
+	ret float %retval1
+}
+
+define float @test3() nounwind {
+entry:
+	%retval = alloca float		; <float*> [#uses=2]
+	%0 = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%1 = frem double 1.000000e-01, -1.000000e+00	; <double> [#uses=1]
+	%2 = fptrunc double %1 to float		; <float> [#uses=1]
+	store float %2, float* %0, align 4
+	%3 = load float, float* %0, align 4		; <float> [#uses=1]
+	store float %3, float* %retval, align 4
+	br label %return
+
+return:		; preds = %entry
+	%retval1 = load float, float* %retval		; <float> [#uses=1]
+	ret float %retval1
+}
+
+define float @test4() nounwind {
+entry:
+	%retval = alloca float		; <float*> [#uses=2]
+	%0 = alloca float		; <float*> [#uses=2]
+	%"alloca point" = bitcast i32 0 to i32		; <i32> [#uses=0]
+	%1 = frem double -1.000000e-01, -1.000000e+00	; <double> [#uses=1]
+	%2 = fptrunc double %1 to float		; <float> [#uses=1]
+	store float %2, float* %0, align 4
+	%3 = load float, float* %0, align 4		; <float> [#uses=1]
+	store float %3, float* %retval, align 4
+	br label %return
+
+return:		; preds = %entry
+	%retval1 = load float, float* %retval		; <float> [#uses=1]
+	ret float %retval1
+}

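The folded values being counted: frem takes the sign of its dividend, so the four sign combinations reduce to plus or minus 0.1, two of each, and 0x3FB99999A0000000 is the hexadecimal double spelling of the float nearest 0.1 (0xBFB99999A0000000 with the sign flipped):

  frem( 0.1,  1.0) =  0.1     frem( 0.1, -1.0) =  0.1
  frem(-0.1,  1.0) = -0.1     frem(-0.1, -1.0) = -0.1
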
Added: llvm/trunk/test/Transforms/InstCombine/2009-01-24-EmptyStruct.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-01-24-EmptyStruct.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-01-24-EmptyStruct.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-01-24-EmptyStruct.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,18 @@
+; RUN: opt < %s -instcombine
+; PR3381
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+	%struct.atomic_t = type { i32 }
+	%struct.inode = type { i32, %struct.mutex }
+	%struct.list_head = type { %struct.list_head*, %struct.list_head* }
+	%struct.lock_class_key = type {  }
+	%struct.mutex = type { %struct.atomic_t, %struct.rwlock_t, %struct.list_head }
+	%struct.rwlock_t = type { %struct.lock_class_key }
+
+define void @handle_event(%struct.inode* %bar) nounwind {
+entry:
+	%0 = getelementptr %struct.inode, %struct.inode* %bar, i64 -1, i32 1, i32 1		; <%struct.rwlock_t*> [#uses=1]
+	%1 = bitcast %struct.rwlock_t* %0 to i32*		; <i32*> [#uses=1]
+	store i32 1, i32* %1, align 4
+	ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-01-31-InfIterate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-01-31-InfIterate.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-01-31-InfIterate.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-01-31-InfIterate.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt < %s -instcombine | llvm-dis
+; PR3452
+define i128 @test(i64 %A, i64 %B, i1 %C, i128 %Z, i128 %Y, i64* %P, i64* %Q) {
+entry:
+	%tmp2 = trunc i128 %Z to i64
+	%tmp4 = trunc i128 %Y to i64
+	store i64 %tmp2, i64* %P
+	store i64 %tmp4, i64* %Q
+	%x = sub i64 %tmp2, %tmp4
+	%c = sub i64 %tmp2, %tmp4
+	%tmp137 = zext i1 %C to i64
+	%tmp138 = sub i64 %c, %tmp137
+	br label %T
+
+T:
+	%G = phi i64 [%tmp138, %entry], [%tmp2, %Fal]
+	%F = zext i64 %G to i128
+	ret i128 %F
+
+Fal:
+	br label %T
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-01-31-Pressure.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-01-31-Pressure.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-01-31-Pressure.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-01-31-Pressure.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; RUN: opt < %s -instcombine -S | grep "%B = add i8 %b, %x"
+; PR2698
+
+declare void @use1(i1)
+declare void @use8(i8)
+
+define void @test1(i8 %a, i8 %b, i8 %x) {
+  %A = add i8 %a, %x
+  %B = add i8 %b, %x
+  %C = icmp eq i8 %A, %B
+  call void @use1(i1 %C)
+  ret void
+}
+
+define void @test2(i8 %a, i8 %b, i8 %x) {
+  %A = add i8 %a, %x
+  %B = add i8 %b, %x
+  %C = icmp eq i8 %A, %B
+  call void @use1(i1 %C)
+  call void @use8(i8 %A)
+  ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-02-04-FPBitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-02-04-FPBitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-02-04-FPBitcast.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-02-04-FPBitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine
+; PR3468
+
+define x86_fp80 @cast() {
+	%tmp = bitcast i80 0 to x86_fp80		; <x86_fp80> [#uses=1]
+	ret x86_fp80 %tmp
+}
+
+define i80 @invcast() {
+	%tmp = bitcast x86_fp80 0xK00000000000000000000 to i80		; <i80> [#uses=1]
+	ret i80 %tmp
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-02-11-NotInitialized.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-02-11-NotInitialized.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-02-11-NotInitialized.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-02-11-NotInitialized.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt < %s -inline -instcombine -functionattrs | llvm-dis
+;
+; Check that nocapture attributes are added when run after an SCC pass.
+; PR3520
+
+define i32 @use(i8* %x) nounwind readonly {
+; CHECK: @use(i8* nocapture %x)
+  %1 = tail call i64 @strlen(i8* %x) nounwind readonly
+  %2 = trunc i64 %1 to i32
+  ret i32 %2
+}
+
+declare i64 @strlen(i8*) nounwind readonly
+; CHECK: declare i64 @strlen(i8* nocapture) nounwind readonly

Added: llvm/trunk/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,279 @@
+; RUN: opt < %s -instcombine -sroa -S | not grep " = alloca"
+; rdar://6417724
+; Instcombine shouldn't do anything to this function that prevents promoting the allocas inside it.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9.6"
+
+%"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >" = type { i32* }
+%"struct.std::_Vector_base<int,std::allocator<int> >" = type { %"struct.std::_Vector_base<int,std::allocator<int> >::_Vector_impl" }
+%"struct.std::_Vector_base<int,std::allocator<int> >::_Vector_impl" = type { i32*, i32*, i32* }
+%"struct.std::bidirectional_iterator_tag" = type <{ i8 }>
+%"struct.std::forward_iterator_tag" = type <{ i8 }>
+%"struct.std::input_iterator_tag" = type <{ i8 }>
+%"struct.std::random_access_iterator_tag" = type <{ i8 }>
+%"struct.std::vector<int,std::allocator<int> >" = type { %"struct.std::_Vector_base<int,std::allocator<int> >" }
+
+define i32* @_Z3fooRSt6vectorIiSaIiEE(%"struct.std::vector<int,std::allocator<int> >"* %X) {
+entry:
+  %0 = alloca %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"
+  %__first_addr.i.i = alloca %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"
+  %__last_addr.i.i = alloca %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"
+  %unnamed_arg.i = alloca %"struct.std::bidirectional_iterator_tag", align 8
+  %1 = alloca %"struct.std::bidirectional_iterator_tag"
+  %__first_addr.i = alloca %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"
+  %2 = alloca %"struct.std::bidirectional_iterator_tag"
+  %3 = alloca %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"
+  %4 = alloca i32
+  %"alloca point" = bitcast i32 0 to i32
+  store i32 42, i32* %4, align 4
+  %5 = getelementptr %"struct.std::vector<int,std::allocator<int> >", %"struct.std::vector<int,std::allocator<int> >"* %X, i32 0, i32 0
+  %6 = getelementptr %"struct.std::_Vector_base<int,std::allocator<int> >", %"struct.std::_Vector_base<int,std::allocator<int> >"* %5, i32 0, i32 0
+  %7 = getelementptr %"struct.std::_Vector_base<int,std::allocator<int> >::_Vector_impl", %"struct.std::_Vector_base<int,std::allocator<int> >::_Vector_impl"* %6, i32 0, i32 1
+  %8 = load i32*, i32** %7, align 4
+  %9 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %3, i32 0, i32 0
+  store i32* %8, i32** %9, align 4
+  %10 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %3, i32 0, i32 0
+  %11 = load i32*, i32** %10, align 4
+  %tmp2.i = ptrtoint i32* %11 to i32
+  %tmp1.i = inttoptr i32 %tmp2.i to i32*
+  %tmp3 = ptrtoint i32* %tmp1.i to i32
+  %tmp2 = inttoptr i32 %tmp3 to i32*
+  %12 = getelementptr %"struct.std::vector<int,std::allocator<int> >", %"struct.std::vector<int,std::allocator<int> >"* %X, i32 0, i32 0
+  %13 = getelementptr %"struct.std::_Vector_base<int,std::allocator<int> >", %"struct.std::_Vector_base<int,std::allocator<int> >"* %12, i32 0, i32 0
+  %14 = getelementptr %"struct.std::_Vector_base<int,std::allocator<int> >::_Vector_impl", %"struct.std::_Vector_base<int,std::allocator<int> >::_Vector_impl"* %13, i32 0, i32 0
+  %15 = load i32*, i32** %14, align 4
+  %16 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %0, i32 0, i32 0
+  store i32* %15, i32** %16, align 4
+  %17 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %0, i32 0, i32 0
+  %18 = load i32*, i32** %17, align 4
+  %tmp2.i17 = ptrtoint i32* %18 to i32
+  %tmp1.i18 = inttoptr i32 %tmp2.i17 to i32*
+  %tmp8 = ptrtoint i32* %tmp1.i18 to i32
+  %tmp6 = inttoptr i32 %tmp8 to i32*
+  %19 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i, i32 0, i32 0
+  store i32* %tmp6, i32** %19
+  %20 = getelementptr %"struct.std::bidirectional_iterator_tag", %"struct.std::bidirectional_iterator_tag"* %1, i32 0, i32 0
+  %21 = load i8, i8* %20, align 1
+  %22 = or i8 %21, 0
+  %23 = or i8 %22, 0
+  %24 = or i8 %23, 0
+  %25 = getelementptr %"struct.std::bidirectional_iterator_tag", %"struct.std::bidirectional_iterator_tag"* %2, i32 0, i32 0
+  store i8 0, i8* %25, align 1
+  %elt.i = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i, i32 0, i32 0
+  %val.i = load i32*, i32** %elt.i
+  %tmp.i = bitcast %"struct.std::bidirectional_iterator_tag"* %unnamed_arg.i to i8*
+  %tmp9.i = bitcast %"struct.std::bidirectional_iterator_tag"* %2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp.i, i8* %tmp9.i, i64 1, i1 false)
+  %26 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  store i32* %val.i, i32** %26
+  %27 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__last_addr.i.i, i32 0, i32 0
+  store i32* %tmp2, i32** %27
+  %28 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__last_addr.i.i, i32 0, i32 0
+  %29 = load i32*, i32** %28, align 4
+  %30 = ptrtoint i32* %29 to i32
+  %31 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %32 = load i32*, i32** %31, align 4
+  %33 = ptrtoint i32* %32 to i32
+  %34 = sub i32 %30, %33
+  %35 = ashr i32 %34, 2
+  %36 = ashr i32 %35, 2
+  br label %bb12.i.i
+
+bb.i.i:                                           ; preds = %bb12.i.i
+  %37 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %38 = load i32*, i32** %37, align 4
+  %39 = load i32, i32* %38, align 4
+  %40 = load i32, i32* %4, align 4
+  %41 = icmp eq i32 %39, %40
+  %42 = zext i1 %41 to i8
+  %toBool.i.i = icmp ne i8 %42, 0
+  br i1 %toBool.i.i, label %bb1.i.i, label %bb2.i.i
+
+bb1.i.i:                                          ; preds = %bb.i.i
+  %43 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %44 = load i32*, i32** %43, align 4
+  br label %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+
+bb2.i.i:                                          ; preds = %bb.i.i
+  %45 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %46 = load i32*, i32** %45, align 4
+  %47 = getelementptr i32, i32* %46, i64 1
+  %48 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  store i32* %47, i32** %48, align 4
+  %49 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %50 = load i32*, i32** %49, align 4
+  %51 = load i32, i32* %50, align 4
+  %52 = load i32, i32* %4, align 4
+  %53 = icmp eq i32 %51, %52
+  %54 = zext i1 %53 to i8
+  %toBool3.i.i = icmp ne i8 %54, 0
+  br i1 %toBool3.i.i, label %bb4.i.i, label %bb5.i.i
+
+bb4.i.i:                                          ; preds = %bb2.i.i
+  %55 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %56 = load i32*, i32** %55, align 4
+  br label %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+
+bb5.i.i:                                          ; preds = %bb2.i.i
+  %57 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %58 = load i32*, i32** %57, align 4
+  %59 = getelementptr i32, i32* %58, i64 1
+  %60 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  store i32* %59, i32** %60, align 4
+  %61 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %62 = load i32*, i32** %61, align 4
+  %63 = load i32, i32* %62, align 4
+  %64 = load i32, i32* %4, align 4
+  %65 = icmp eq i32 %63, %64
+  %66 = zext i1 %65 to i8
+  %toBool6.i.i = icmp ne i8 %66, 0
+  br i1 %toBool6.i.i, label %bb7.i.i, label %bb8.i.i
+
+bb7.i.i:                                          ; preds = %bb5.i.i
+  %67 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %68 = load i32*, i32** %67, align 4
+  br label %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+
+bb8.i.i:                                          ; preds = %bb5.i.i
+  %69 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %70 = load i32*, i32** %69, align 4
+  %71 = getelementptr i32, i32* %70, i64 1
+  %72 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  store i32* %71, i32** %72, align 4
+  %73 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %74 = load i32*, i32** %73, align 4
+  %75 = load i32, i32* %74, align 4
+  %76 = load i32, i32* %4, align 4
+  %77 = icmp eq i32 %75, %76
+  %78 = zext i1 %77 to i8
+  %toBool9.i.i = icmp ne i8 %78, 0
+  br i1 %toBool9.i.i, label %bb10.i.i, label %bb11.i.i
+
+bb10.i.i:                                         ; preds = %bb8.i.i
+  %79 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %80 = load i32*, i32** %79, align 4
+  br label %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+
+bb11.i.i:                                         ; preds = %bb8.i.i
+  %81 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %82 = load i32*, i32** %81, align 4
+  %83 = getelementptr i32, i32* %82, i64 1
+  %84 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  store i32* %83, i32** %84, align 4
+  %85 = sub i32 %__trip_count.0.i.i, 1
+  br label %bb12.i.i
+
+bb12.i.i:                                         ; preds = %bb11.i.i, %entry
+  %__trip_count.0.i.i = phi i32 [ %36, %entry ], [ %85, %bb11.i.i ]
+  %86 = icmp sgt i32 %__trip_count.0.i.i, 0
+  br i1 %86, label %bb.i.i, label %bb13.i.i
+
+bb13.i.i:                                         ; preds = %bb12.i.i
+  %87 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__last_addr.i.i, i32 0, i32 0
+  %88 = load i32*, i32** %87, align 4
+  %89 = ptrtoint i32* %88 to i32
+  %90 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %91 = load i32*, i32** %90, align 4
+  %92 = ptrtoint i32* %91 to i32
+  %93 = sub i32 %89, %92
+  %94 = ashr i32 %93, 2
+  switch i32 %94, label %bb26.i.i [
+    i32 1, label %bb22.i.i
+    i32 2, label %bb18.i.i
+    i32 3, label %bb14.i.i
+  ]
+
+bb14.i.i:                                         ; preds = %bb13.i.i
+  %95 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %96 = load i32*, i32** %95, align 4
+  %97 = load i32, i32* %96, align 4
+  %98 = load i32, i32* %4, align 4
+  %99 = icmp eq i32 %97, %98
+  %100 = zext i1 %99 to i8
+  %toBool15.i.i = icmp ne i8 %100, 0
+  br i1 %toBool15.i.i, label %bb16.i.i, label %bb17.i.i
+
+bb16.i.i:                                         ; preds = %bb14.i.i
+  %101 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %102 = load i32*, i32** %101, align 4
+  br label %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+
+bb17.i.i:                                         ; preds = %bb14.i.i
+  %103 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %104 = load i32*, i32** %103, align 4
+  %105 = getelementptr i32, i32* %104, i64 1
+  %106 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  store i32* %105, i32** %106, align 4
+  br label %bb18.i.i
+
+bb18.i.i:                                         ; preds = %bb17.i.i, %bb13.i.i
+  %107 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %108 = load i32*, i32** %107, align 4
+  %109 = load i32, i32* %108, align 4
+  %110 = load i32, i32* %4, align 4
+  %111 = icmp eq i32 %109, %110
+  %112 = zext i1 %111 to i8
+  %toBool19.i.i = icmp ne i8 %112, 0
+  br i1 %toBool19.i.i, label %bb20.i.i, label %bb21.i.i
+
+bb20.i.i:                                         ; preds = %bb18.i.i
+  %113 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %114 = load i32*, i32** %113, align 4
+  br label %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+
+bb21.i.i:                                         ; preds = %bb18.i.i
+  %115 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %116 = load i32*, i32** %115, align 4
+  %117 = getelementptr i32, i32* %116, i64 1
+  %118 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  store i32* %117, i32** %118, align 4
+  br label %bb22.i.i
+
+bb22.i.i:                                         ; preds = %bb21.i.i, %bb13.i.i
+  %119 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %120 = load i32*, i32** %119, align 4
+  %121 = load i32, i32* %120, align 4
+  %122 = load i32, i32* %4, align 4
+  %123 = icmp eq i32 %121, %122
+  %124 = zext i1 %123 to i8
+  %toBool23.i.i = icmp ne i8 %124, 0
+  br i1 %toBool23.i.i, label %bb24.i.i, label %bb25.i.i
+
+bb24.i.i:                                         ; preds = %bb22.i.i
+  %125 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %126 = load i32*, i32** %125, align 4
+  br label %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+
+bb25.i.i:                                         ; preds = %bb22.i.i
+  %127 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  %128 = load i32*, i32** %127, align 4
+  %129 = getelementptr i32, i32* %128, i64 1
+  %130 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__first_addr.i.i, i32 0, i32 0
+  store i32* %129, i32** %130, align 4
+  br label %bb26.i.i
+
+bb26.i.i:                                         ; preds = %bb25.i.i, %bb13.i.i
+  %131 = getelementptr %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >"* %__last_addr.i.i, i32 0, i32 0
+  %132 = load i32*, i32** %131, align 4
+  br label %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+
+_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit: ; preds = %bb26.i.i, %bb24.i.i, %bb20.i.i, %bb16.i.i, %bb10.i.i, %bb7.i.i, %bb4.i.i, %bb1.i.i
+  %.0.0.i.i = phi i32* [ %132, %bb26.i.i ], [ %126, %bb24.i.i ], [ %114, %bb20.i.i ], [ %102, %bb16.i.i ], [ %80, %bb10.i.i ], [ %68, %bb7.i.i ], [ %56, %bb4.i.i ], [ %44, %bb1.i.i ]
+  %tmp2.i.i = ptrtoint i32* %.0.0.i.i to i32
+  %tmp1.i.i = inttoptr i32 %tmp2.i.i to i32*
+  %tmp4.i = ptrtoint i32* %tmp1.i.i to i32
+  %tmp3.i = inttoptr i32 %tmp4.i to i32*
+  %tmp8.i = ptrtoint i32* %tmp3.i to i32
+  %tmp6.i = inttoptr i32 %tmp8.i to i32*
+  %tmp12 = ptrtoint i32* %tmp6.i to i32
+  %tmp10 = inttoptr i32 %tmp12 to i32*
+  %tmp16 = ptrtoint i32* %tmp10 to i32
+  br label %return
+
+return:                                           ; preds = %_ZSt4findIN9__gnu_cxx17__normal_iteratorIPiSt6vectorIiSaIiEEEEiET_S7_S7_RKT0_.exit
+  %tmp14 = inttoptr i32 %tmp16 to i32*
+  ret i32* %tmp14
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind

Added: llvm/trunk/test/Transforms/InstCombine/2009-02-21-LoadCST.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-02-21-LoadCST.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-02-21-LoadCST.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-02-21-LoadCST.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine -S | grep "ret i32 3679669"
+; PR3595
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i386-pc-linux-gnu"
+
+@.str1 = internal constant [4 x i8] c"\B5%8\00"
+
+define i32 @test() {
+  %rhsv = load i32, i32* bitcast ([4 x i8]* @.str1 to i32*), align 1
+  ret i32 %rhsv
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-02-25-CrashZeroSizeArray.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-02-25-CrashZeroSizeArray.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-02-25-CrashZeroSizeArray.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-02-25-CrashZeroSizeArray.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; RUN: opt < %s -instcombine | llvm-dis
+; PR3667
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i386-pc-linux-gnu"
+
+define void @_ada_c32001b(i32 %tmp5) {
+entry:
+	%max289 = select i1 false, i32 %tmp5, i32 0		; <i32> [#uses=1]
+	%tmp6 = mul i32 %max289, 4		; <i32> [#uses=1]
+	%tmp7 = alloca i8, i32 0		; <i8*> [#uses=1]
+	%tmp8 = bitcast i8* %tmp7 to [0 x [0 x i32]]*		; <[0 x [0 x i32]]*> [#uses=1]
+	%tmp11 = load i32, i32* null, align 1		; <i32> [#uses=1]
+	%tmp12 = icmp eq i32 %tmp11, 3		; <i1> [#uses=1]
+	%tmp13 = zext i1 %tmp12 to i8		; <i8> [#uses=1]
+	%tmp14 = ashr i32 %tmp6, 2		; <i32> [#uses=1]
+	%tmp15 = bitcast [0 x [0 x i32]]* %tmp8 to i8*		; <i8*> [#uses=1]
+	%tmp16 = mul i32 %tmp14, 4		; <i32> [#uses=1]
+	%tmp17 = mul i32 1, %tmp16		; <i32> [#uses=1]
+	%tmp18 = getelementptr i8, i8* %tmp15, i32 %tmp17		; <i8*> [#uses=1]
+	%tmp19 = bitcast i8* %tmp18 to [0 x i32]*		; <[0 x i32]*> [#uses=1]
+	%tmp20 = bitcast [0 x i32]* %tmp19 to i32*		; <i32*> [#uses=1]
+	%tmp21 = getelementptr i32, i32* %tmp20, i32 0		; <i32*> [#uses=1]
+	%tmp22 = load i32, i32* %tmp21, align 1		; <i32> [#uses=1]
+	%tmp23 = icmp eq i32 %tmp22, 4		; <i1> [#uses=1]
+	%tmp24 = zext i1 %tmp23 to i8		; <i8> [#uses=1]
+	%toBool709 = icmp ne i8 %tmp13, 0		; <i1> [#uses=1]
+	%toBool710 = icmp ne i8 %tmp24, 0		; <i1> [#uses=1]
+	%tmp25 = and i1 %toBool709, %toBool710		; <i1> [#uses=1]
+	%tmp26 = zext i1 %tmp25 to i8		; <i8> [#uses=1]
+	%toBool711 = icmp ne i8 %tmp26, 0		; <i1> [#uses=1]
+	br i1 %toBool711, label %a, label %b
+
+a:		; preds = %entry
+	ret void
+
+b:		; preds = %entry
+	ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-03-18-vector-ashr-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-03-18-vector-ashr-crash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-03-18-vector-ashr-crash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-03-18-vector-ashr-crash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine | llvm-dis
+; PR3826
+
+define void @0(<4 x i16>*, <4 x i16>*) {
+	%3 = alloca <4 x i16>*		; <<4 x i16>**> [#uses=1]
+	%4 = load <4 x i16>, <4 x i16>* null, align 1		; <<4 x i16>> [#uses=1]
+	%5 = ashr <4 x i16> %4, <i16 5, i16 5, i16 5, i16 5>		; <<4 x i16>> [#uses=1]
+	%6 = load <4 x i16>*, <4 x i16>** %3		; <<4 x i16>*> [#uses=1]
+	store <4 x i16> %5, <4 x i16>* %6, align 1
+	ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-03-24-InfLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-03-24-InfLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-03-24-InfLoop.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-03-24-InfLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; PR3874
+; RUN: opt < %s -instcombine | llvm-dis
+define i1 @test(i32 %x) {
+  %A = lshr i32 3968, %x
+  %B = and i32 %A, 1
+  %C = icmp eq i32 %B, 0
+  ret i1 %C
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-04-07-MulPromoteToI96.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt < %s -instcombine -S | grep "mul i64"
+; rdar://6762288
+
+; Instcombine should not promote the mul to i96 because it is definitely
+; not a legal type for the target, and we don't want a libcall.
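+; A sketch of the expected output (an inference from the RUN line's grep for
+; "mul i64", not taken from the commit): the multiply stays in i64 and only
+; the zext produces an i96 value, i.e. roughly:
+;   %mul = mul i64 %a, %b
+;   %res = zext i64 %mul to i96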
+
+define i96 @test(i96 %a.4, i96 %b.2) {
+	%tmp1086 = trunc i96 %a.4 to i64		; <i64> [#uses=1]
+	%tmp836 = trunc i96 %b.2 to i64		; <i64> [#uses=1]
+	%mul185 = mul i64 %tmp1086, %tmp836		; <i64> [#uses=1]
+	%tmp544 = zext i64 %mul185 to i96		; <i96> [#uses=1]
+	ret i96 %tmp544
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-05-23-FCmpToICmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-05-23-FCmpToICmp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-05-23-FCmpToICmp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-05-23-FCmpToICmp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | not grep cmp
+; rdar://6903175
+
+define i1 @f0(i32 *%a) nounwind {
+       %b = load i32, i32* %a, align 4
+       %c = uitofp i32 %b to double
+       %d = fcmp ogt double %c, 0x41EFFFFFFFE00000
+       ret i1 %d
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-06-11-StoreAddrSpace.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,7 @@
+; RUN: opt < %s -instcombine -S | grep "store i32 0,"
+; PR4366
+
+define void @a() {
+  store i32 0, i32 addrspace(1)* null
+  ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-06-16-SRemDemandedBits.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-06-16-SRemDemandedBits.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-06-16-SRemDemandedBits.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-06-16-SRemDemandedBits.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,9 @@
+; RUN: opt < %s -instcombine -S | grep srem
+; PR3439
+
+define i32 @a(i32 %x) nounwind {
+entry:
+	%rem = srem i32 %x, 2
+	%and = and i32 %rem, 2
+	ret i32 %and
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-07-02-MaskedIntVector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-07-02-MaskedIntVector.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-07-02-MaskedIntVector.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-07-02-MaskedIntVector.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; RUN: opt < %s -instcombine | llvm-dis
+; PR4495
+
+define i32 @test(i64 %test) {
+entry:
+	%0 = bitcast <4 x i32> undef to <16 x i8>		; <<16 x i8>> [#uses=1]
+	%t12 = shufflevector <16 x i8> %0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>		; <<16 x i8>> [#uses=1]
+	%t11 = bitcast <16 x i8> %t12 to <2 x i64>		; <<2 x i64>> [#uses=1]
+	%t9 = extractelement <2 x i64> %t11, i32 0		; <i64> [#uses=1]
+	%t10 = bitcast i64 %t9 to <2 x i32>		; <<2 x i32>> [#uses=1]
+	%t7 = bitcast i64 %test to <2 x i32>		; <<2 x i32>> [#uses=1]
+	%t6 = xor <2 x i32> %t10, %t7		; <<2 x i32>> [#uses=1]
+	%t1 = extractelement <2 x i32> %t6, i32 0		; <i32> [#uses=1]
+	ret i32 %t1
+}

Added: llvm/trunk/test/Transforms/InstCombine/2009-12-17-CmpSelectNull.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2009-12-17-CmpSelectNull.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2009-12-17-CmpSelectNull.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2009-12-17-CmpSelectNull.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,16 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+@.str254 = internal constant [2 x i8] c".\00"
+@.str557 = internal constant [3 x i8] c"::\00"
+
+define i8* @demangle_qualified(i32 %isfuncname) nounwind {
+entry:
+  %tobool272 = icmp ne i32 %isfuncname, 0
+  %cond276 = select i1 %tobool272, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str254, i32 0, i32 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str557, i32 0, i32 0) ; <i8*> [#uses=4]
+  %cmp.i504 = icmp eq i8* %cond276, null
+  %rval = getelementptr i8, i8* %cond276, i1 %cmp.i504
+  ret i8* %rval
+}
+
+; CHECK: %cond276 = select i1
+; CHECK: ret i8* %cond276

Added: llvm/trunk/test/Transforms/InstCombine/2010-01-28-NegativeSRem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2010-01-28-NegativeSRem.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2010-01-28-NegativeSRem.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2010-01-28-NegativeSRem.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR6165
+
+define i32 @f() {
+entry:
+  br label %BB1
+
+BB1:                                              ; preds = %BB1, %entry
+; CHECK: BB1:
+  %x = phi i32 [ -29, %entry ], [ 0, %BB1 ]       ; <i32> [#uses=2]
+  %rem = srem i32 %x, 2                           ; <i32> [#uses=1]
+  %t = icmp eq i32 %rem, -1                       ; <i1> [#uses=1]
+  br i1 %t, label %BB2, label %BB1
+; CHECK-NOT: br i1 false
+
+BB2:                                              ; preds = %BB1
+; CHECK: BB2:
+  ret i32 %x
+}

Added: llvm/trunk/test/Transforms/InstCombine/2010-03-03-ExtElim.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2010-03-03-ExtElim.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2010-03-03-ExtElim.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2010-03-03-ExtElim.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
+target triple = "i386-unknown-linux-gnu"
+
+@g_92 = common global [2 x i32*] zeroinitializer, align 4 ; <[2 x i32*]*> [#uses=1]
+@g_177 = constant i32** bitcast (i8* getelementptr (i8, i8* bitcast ([2 x i32*]* @g_92 to i8*), i64 4) to i32**), align 4 ; <i32***> [#uses=1]
+
+define i1 @PR6486() nounwind {
+; CHECK-LABEL: @PR6486(
+  %tmp = load i32**, i32*** @g_177                       ; <i32**> [#uses=1]
+  %cmp = icmp ne i32** null, %tmp                 ; <i1> [#uses=1]
+  %conv = zext i1 %cmp to i32                     ; <i32> [#uses=1]
+  %cmp1 = icmp sle i32 0, %conv                   ; <i1> [#uses=1]
+  ret i1 %cmp1
+; CHECK: ret i1 true
+}
+
+@d = common global i32 0, align 4
+@a = common global [1 x i32] zeroinitializer, align 4
+
+define i1 @PR16462_1() nounwind {
+; CHECK-LABEL: @PR16462_1(
+  ret i1 icmp sgt (i32 sext (i16 trunc (i32 select (i1 icmp eq (i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 0, i32 0), i32* @d), i32 0, i32 1) to i16) to i32), i32 65535)
+; CHECK: ret i1 false
+}
+
+define i1 @PR16462_2() nounwind {
+; CHECK-LABEL: @PR16462_2(
+  ret i1 icmp sgt (i32 sext (i16 trunc (i32 select (i1 icmp eq (i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i32 0, i32 0), i32* @d), i32 0, i32 1) to i16) to i32), i32 42)
+; CHECK: ret i1 false
+}

Added: llvm/trunk/test/Transforms/InstCombine/2010-05-30-memcpy-Struct.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2010-05-30-memcpy-Struct.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2010-05-30-memcpy-Struct.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2010-05-30-memcpy-Struct.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+; PR7265
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+%union.anon = type { i32, [4 x i8] }
+
+@.str = private constant [3 x i8] c"%s\00"
+
+define void @CopyEventArg(%union.anon* %ev) nounwind {
+entry:
+  %call = call i32 (i8*, i8*, ...) @sprintf(i8* undef, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str, i64 0, i64 0), %union.anon* %ev) nounwind
+; CHECK: bitcast %union.anon* %ev to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+  ret void
+}
+
+declare i32 @sprintf(i8*, i8*, ...)
+

Added: llvm/trunk/test/Transforms/InstCombine/2010-11-01-lshr-mask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2010-11-01-lshr-mask.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2010-11-01-lshr-mask.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2010-11-01-lshr-mask.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,57 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+; <rdar://problem/8606771>
+define i32 @main(i32 %argc) {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:    [[TMP3151:%.*]] = trunc i32 %argc to i8
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i8 [[TMP3151]], 5
+; CHECK-NEXT:    [[TMP4126:%.*]] = and i8 [[TMP1]], 64
+; CHECK-NEXT:    [[TMP4127:%.*]] = xor i8 [[TMP4126]], 64
+; CHECK-NEXT:    [[TMP4086:%.*]] = zext i8 [[TMP4127]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4086]]
+;
+  %tmp3151 = trunc i32 %argc to i8
+  %tmp3161 = or i8 %tmp3151, -17
+  %tmp3162 = and i8 %tmp3151, 122
+  %tmp3163 = xor i8 %tmp3162, -17
+  %tmp4114 = shl i8 %tmp3163, 6
+  %tmp4115 = xor i8 %tmp4114, %tmp3163
+  %tmp4120 = xor i8 %tmp3161, %tmp4115
+  %tmp4126 = lshr i8 %tmp4120, 7
+  %tmp4127 = mul i8 %tmp4126, 64
+  %tmp4086 = zext i8 %tmp4127 to i32
+  ret i32 %tmp4086
+}
+
+; rdar://8739316
+define i8 @foo(i8 %arg, i8 %arg1) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[TMP:%.*]] = shl i8 %arg, 7
+; CHECK-NEXT:    [[TMP2:%.*]] = and i8 %arg1, 84
+; CHECK-NEXT:    [[TMP3:%.*]] = and i8 %arg1, -118
+; CHECK-NEXT:    [[TMP4:%.*]] = and i8 %arg1, 33
+; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw i8 40, [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and i8 [[TMP5]], 84
+; CHECK-NEXT:    [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i8 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = lshr i8 [[TMP8]], 7
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i8 [[TMP10]], 5
+; CHECK-NEXT:    [[TMP12:%.*]] = xor i8 [[TMP11]], [[TMP9]]
+; CHECK-NEXT:    ret i8 [[TMP12]]
+;
+  %tmp = shl i8 %arg, 7
+  %tmp2 = and i8 %arg1, 84
+  %tmp3 = and i8 %arg1, -118
+  %tmp4 = and i8 %arg1, 33
+  %tmp5 = sub i8 -88, %tmp2
+  %tmp6 = and i8 %tmp5, 84
+  %tmp7 = or i8 %tmp4, %tmp6
+  %tmp8 = xor i8 %tmp, %tmp3
+  %tmp9 = or i8 %tmp7, %tmp8
+  %tmp10 = lshr i8 %tmp8, 7
+  %tmp11 = shl i8 %tmp10, 5
+  %tmp12 = xor i8 %tmp11, %tmp9
+  ret i8 %tmp12
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2010-11-21-SizeZeroTypeGEP.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2010-11-21-SizeZeroTypeGEP.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2010-11-21-SizeZeroTypeGEP.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2010-11-21-SizeZeroTypeGEP.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+
+define {}* @foo({}* %x, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK-NOT: getelementptr
+  %p = getelementptr {}, {}* %x, i32 %n
+  ret {}* %p
+}
+
+define i8* @bar(i64 %n, {{}, [0 x {[0 x i8]}]}* %p) {
+; CHECK-LABEL: @bar(
+  %g = getelementptr {{}, [0 x {[0 x i8]}]}, {{}, [0 x {[0 x i8]}]}* %p, i64 %n, i32 1, i64 %n, i32 0, i64 %n
+; CHECK: %p, i64 0, i32 1, i64 0, i32 0, i64 %n
+  ret i8* %g
+}

Added: llvm/trunk/test/Transforms/InstCombine/2010-11-23-Distributed.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2010-11-23-Distributed.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2010-11-23-Distributed.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2010-11-23-Distributed.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,23 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+define i32 @foo(i32 %x, i32 %y) {
+; CHECK-LABEL: @foo(
+  %add = add nsw i32 %y, %x
+  %mul = mul nsw i32 %add, %y
+  %square = mul nsw i32 %y, %y
+  %res = sub i32 %mul, %square
+  ret i32 %res
+; CHECK-NEXT: mul i32 %x, %y
+; CHECK-NEXT: ret i32
+}
+
+define i1 @bar(i64 %x, i64 %y) {
+; CHECK-LABEL: @bar(
+  %a = and i64 %y, %x
+; CHECK: and
+; CHECK-NOT: and
+  %not = xor i64 %a, -1
+  %b = and i64 %y, %not
+  %r = icmp eq i64 %b, 0
+  ret i1 %r
+; CHECK: ret i1
+}

Added: llvm/trunk/test/Transforms/InstCombine/2011-02-14-InfLoop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2011-02-14-InfLoop.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2011-02-14-InfLoop.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2011-02-14-InfLoop.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; This testcase causes an infinite loop in the instruction combiner,
+; because it changes a pattern and the original pattern is almost
+; identical to the newly-generated pattern.
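+; Illustration of the ping-pong (hypothetical shapes, not the exact patterns
+; from this PR): if one transform rewrites a pattern A into B while another
+; rewrites B back into something matching A, the combiner's worklist never
+; reaches a fixed point.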
+; RUN: opt < %s -instcombine -disable-output
+
+; PR9216
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define <4 x float> @m_387(i8* noalias nocapture %A, i8* nocapture %B, <4 x i1> %C) nounwind {
+entry:
+  %movcsext20 = sext <4 x i1> %C to <4 x i32>
+  %tmp2389 = xor <4 x i32> %movcsext20, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %movcand25 = and <4 x i32> %tmp2389, <i32 undef, i32 undef, i32 undef, i32 -1>
+  %movcor26 = or <4 x i32> %movcand25, zeroinitializer
+  %L2 = bitcast <4 x i32> %movcor26 to <4 x float>
+  %L3 = shufflevector <4 x float> zeroinitializer, <4 x float> %L2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %L3
+}

Added: llvm/trunk/test/Transforms/InstCombine/2011-03-08-SRemMinusOneBadOpt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2011-03-08-SRemMinusOneBadOpt.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2011-03-08-SRemMinusOneBadOpt.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2011-03-08-SRemMinusOneBadOpt.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR9346
+
+define i32 @test(i64 %x) nounwind {
+; CHECK: ret i32 0
+entry:
+  %or = or i64 %x, 4294967294
+  %conv = trunc i64 %or to i32
+  %rem.i = srem i32 %conv, -1
+  ret i32 %rem.i
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,15 @@
+; RUN: opt < %s -instcombine
+; PR9579
+
+define <2 x i16> @entry(<2 x i16> %a) nounwind {
+entry:
+  %a.addr = alloca <2 x i16>, align 4
+  %.compoundliteral = alloca <2 x i16>, align 4
+  store <2 x i16> %a, <2 x i16>* %a.addr, align 4
+  %tmp = load <2 x i16>, <2 x i16>* %a.addr, align 4
+  store <2 x i16> zeroinitializer, <2 x i16>* %.compoundliteral
+  %tmp1 = load <2 x i16>, <2 x i16>* %.compoundliteral
+  %cmp = icmp uge <2 x i16> %tmp, %tmp1
+  %sext = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %sext
+}

Added: llvm/trunk/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,21 @@
+; RUN: opt < %s -S -instcombine | FileCheck %s
+; rdar://problem/9267970
+; Ideally this test would run on a 32-bit host.
+; GEPs that are not marked inbounds may overflow at runtime, so they must not
+; be discarded.
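+; Worked example (assuming a 32-bit address space): %p1 = %p + (argc - 2) and
+; %p2 = %p + (argc + 1879048192); the second offset can wrap around the end
+; of the address space, so "icmp ult %p1, %p2" is not necessarily true and
+; must not be folded to a constant.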
+
+define i32 @main(i32 %argc) {
+entry:
+    %tmp1 = add i32 %argc, -2
+    %tmp2 = add i32 %argc, 1879048192
+    %p = alloca i8
+; CHECK: getelementptr
+    %p1 = getelementptr i8, i8* %p, i32 %tmp1
+; CHECK: getelementptr
+    %p2 = getelementptr i8, i8* %p, i32 %tmp2
+    %cmp = icmp ult i8* %p1, %p2
+    br i1 %cmp, label %bbtrue, label %bbfalse
+bbtrue:          ; preds = %entry
+    ret i32 -1
+bbfalse:         ; preds = %entry
+    ret i32 0
+}

Added: llvm/trunk/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,64 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+target triple = "x86_64-apple-macosx10.6.6"
+
+define zeroext i16 @foo1(i32 %on_off) {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT:    [[ON_OFF_TR:%.*]] = trunc i32 %on_off to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i16 [[ON_OFF_TR]], 1
+; CHECK-NEXT:    [[CONV:%.*]] = add i16 [[TMP1]], -2
+; CHECK-NEXT:    ret i16 [[CONV]]
+;
+  %on_off.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  %tmp = load i32, i32* %on_off.addr, align 4
+  %sub = sub i32 1, %tmp
+  %mul = mul i32 %sub, -2
+  store i32 %mul, i32* %a, align 4
+  %tmp1 = load i32, i32* %a, align 4
+  %conv = trunc i32 %tmp1 to i16
+  ret i16 %conv
+}
+
+define zeroext i16 @foo2(i32 %on_off, i32 %q) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT:    [[SUBA:%.*]] = sub i32 %on_off, %q
+; CHECK-NEXT:    [[SUBA_TR:%.*]] = trunc i32 [[SUBA]] to i16
+; CHECK-NEXT:    [[CONV:%.*]] = shl i16 [[SUBA_TR]], 2
+; CHECK-NEXT:    ret i16 [[CONV]]
+;
+  %on_off.addr = alloca i32, align 4
+  %q.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  store i32 %q, i32* %q.addr, align 4
+  %tmp = load i32, i32* %q.addr, align 4
+  %tmp1 = load i32, i32* %on_off.addr, align 4
+  %sub = sub i32 %tmp, %tmp1
+  %mul = mul i32 %sub, -4
+  store i32 %mul, i32* %a, align 4
+  %tmp2 = load i32, i32* %a, align 4
+  %conv = trunc i32 %tmp2 to i16
+  ret i16 %conv
+}
+
+define zeroext i16 @foo3(i32 %on_off) {
+; CHECK-LABEL: @foo3(
+; CHECK-NEXT:    [[ON_OFF_TR:%.*]] = trunc i32 %on_off to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i16 [[ON_OFF_TR]], 2
+; CHECK-NEXT:    [[CONV:%.*]] = add i16 [[TMP1]], -28
+; CHECK-NEXT:    ret i16 [[CONV]]
+;
+  %on_off.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  %tmp = load i32, i32* %on_off.addr, align 4
+  %sub = sub i32 7, %tmp
+  %mul = mul i32 %sub, -4
+  store i32 %mul, i32* %a, align 4
+  %tmp1 = load i32, i32* %a, align 4
+  %conv = trunc i32 %tmp1 to i16
+  ret i16 %conv
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,64 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin10.0.0"
+
+; CHECK-LABEL: define void @fu1(
+define void @fu1(i32 %parm) nounwind ssp {
+  %1 = alloca i32, align 4
+; CHECK: alloca double*
+  %ptr = alloca double*, align 4
+  store i32 %parm, i32* %1, align 4
+  store double* null, double** %ptr, align 4
+  %2 = load i32, i32* %1, align 4
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %4, label %10
+
+; <label>:4                                       ; preds = %0
+  %5 = load i32, i32* %1, align 4
+  %6 = shl nsw i32 %5, 3
+; With "nsw", the alloca and its bitcast can be fused:
+  %7 = add nsw i32 %6, 2048
+;  CHECK: alloca double
+  %8 = alloca i8, i32 %7
+  %9 = bitcast i8* %8 to double*
+; CHECK-NEXT: store double*
+  store double* %9, double** %ptr, align 4
+  br label %10
+; <label>:10                                      ; preds = %4, %0
+  %11 = load double*, double** %ptr, align 4
+  call void @bar(double* %11)
+; CHECK: ret
+  ret void
+}
+
+declare void @bar(double*)
+
+; CHECK-LABEL: define void @fu2(
+define void @fu2(i32 %parm) nounwind ssp {
+  %1 = alloca i32, align 4
+  %ptr = alloca double*, align 4
+  store i32 %parm, i32* %1, align 4
+  store double* null, double** %ptr, align 4
+  %2 = load i32, i32* %1, align 4
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %4, label %10
+
+; <label>:4                                       ; preds = %0
+  %5 = load i32, i32* %1, align 4
+  %6 = mul nsw i32 %5, 8
+; Without "nsw", the alloca and its bitcast cannot be fused:
+  %7 = add  i32 %6, 2048
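+; (Assumed reasoning: without nsw on the add, instcombine cannot prove the
+; size computation stays in range, so it conservatively keeps the i8 alloca.)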
+; CHECK: alloca i8
+  %8 = alloca i8, i32 %7
+; CHECK-NEXT: bitcast double**
+; CHECK-NEXT: store i8*
+  %9 = bitcast i8* %8 to double*
+  store double* %9, double** %ptr, align 4
+  br label %10
+
+; <label>:10                                      ; preds = %4, %0
+  %11 = load double*, double** %ptr, align 4
+  call void @bar(double* %11)
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2011-09-03-Trampoline.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2011-09-03-Trampoline.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2011-09-03-Trampoline.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2011-09-03-Trampoline.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,102 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+declare void @llvm.init.trampoline(i8*, i8*, i8*)
+declare i8* @llvm.adjust.trampoline(i8*)
+declare i32 @f(i8 * nest, i32)
+
+; Most common case
+define i32 @test0(i32 %n) !dbg !4 {
+  %alloca = alloca [10 x i8], align 16
+  %gep = getelementptr [10 x i8], [10 x i8]* %alloca, i32 0, i32 0
+  call void @llvm.init.trampoline(i8* %gep, i8* bitcast (i32 (i8*, i32)* @f to i8*),
+                                  i8* null)
+  %tramp = call i8* @llvm.adjust.trampoline(i8* %gep)
+  %function = bitcast i8* %tramp to i32(i32)*
+  %ret = call i32 %function(i32 %n), !dbg !10
+  ret i32 %ret
+
+; CHECK: define i32 @test0(i32 %n) !dbg !4 {
+; CHECK: %ret = call i32 @f(i8* nest null, i32 %n), !dbg !10
+}
+
+define i32 @test1(i32 %n, i8* %trampmem) {
+  call void @llvm.init.trampoline(i8* %trampmem,
+                                  i8* bitcast (i32 (i8*, i32)* @f to i8*),
+                                  i8* null)
+  %tramp = call i8* @llvm.adjust.trampoline(i8* %trampmem)
+  %function = bitcast i8* %tramp to i32(i32)*
+  %ret = call i32 %function(i32 %n)
+  ret i32 %ret
+; CHECK: define i32 @test1(i32 %n, i8* %trampmem) {
+; CHECK: %ret = call i32 @f(i8* nest null, i32 %n)
+}
+
+define i32 @test2(i32 %n, i8* %trampmem) {
+  %tramp = call i8* @llvm.adjust.trampoline(i8* %trampmem)
+  %functiona = bitcast i8* %tramp to i32(i32)*
+  %ret = call i32 %functiona(i32 %n)
+  ret i32 %ret
+; CHECK: define i32 @test2(i32 %n, i8* %trampmem) {
+; CHECK: %ret = call i32 %functiona(i32 %n)
+}
+
+define i32 @test3(i32 %n, i8* %trampmem) {
+  call void @llvm.init.trampoline(i8* %trampmem,
+                                  i8* bitcast (i32 (i8*, i32)* @f to i8*),
+                                  i8* null)
+
+; CHECK: define i32 @test3(i32 %n, i8* %trampmem) {
+; CHECK: %ret0 = call i32 @f(i8* nest null, i32 %n)
+  %tramp0 = call i8* @llvm.adjust.trampoline(i8* %trampmem)
+  %function0 = bitcast i8* %tramp0 to i32(i32)*
+  %ret0 = call i32 %function0(i32 %n)
+
+  ;; Not optimized, since the previous call could be writing to the trampoline.
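+  ;; (Assumed rationale: a store through %trampmem would invalidate the
+  ;; trampoline set up by llvm.init.trampoline, so the second call cannot be
+  ;; rewritten into a direct call to @f.)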
+  %tramp1 = call i8* @llvm.adjust.trampoline(i8* %trampmem)
+  %function1 = bitcast i8* %tramp1 to i32(i32)*
+  %ret1 = call i32 %function1(i32 %n)
+; CHECK: %ret1 = call i32 %function1(i32 %n)
+
+  ret i32 %ret1
+}
+
+define i32 @test4(i32 %n) {
+  %alloca = alloca [10 x i8], align 16
+  %gep = getelementptr [10 x i8], [10 x i8]* %alloca, i32 0, i32 0
+  call void @llvm.init.trampoline(i8* %gep, i8* bitcast (i32 (i8*, i32)* @f to i8*),
+                                  i8* null)
+
+  %tramp0 = call i8* @llvm.adjust.trampoline(i8* %gep)
+  %function0 = bitcast i8* %tramp0 to i32(i32)*
+  %ret0 = call i32 %function0(i32 %n)
+
+  %tramp1 = call i8* @llvm.adjust.trampoline(i8* %gep)
+  %function1 = bitcast i8* %tramp0 to i32(i32)*
+  %ret1 = call i32 %function1(i32 %n)
+
+  %tramp2 = call i8* @llvm.adjust.trampoline(i8* %gep)
+  %function2 = bitcast i8* %tramp2 to i32(i32)*
+  %ret2 = call i32 %function2(i32 %n)
+
+  ret i32 %ret2
+
+; CHECK: define i32 @test4(i32 %n) {
+; CHECK: %ret0 = call i32 @f(i8* nest null, i32 %n)
+; CHECK: %ret1 = call i32 @f(i8* nest null, i32 %n)
+; CHECK: %ret2 = call i32 @f(i8* nest null, i32 %n)
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.0 (trunk 127710)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2)
+!1 = !DIFile(filename: "string.h", directory: "Game")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "passthru", scope: !1, file: !1, line: 79, type: !5, isLocal: true, isDefinition: true, scopeLine: 79, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !8)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: null, size: 64, align: 64)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 78, type: !7)
+!10 = !DILocation(line: 78, column: 28, scope: !4)

Added: llvm/trunk/test/Transforms/InstCombine/2011-10-07-AlignPromotion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2011-10-07-AlignPromotion.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2011-10-07-AlignPromotion.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2011-10-07-AlignPromotion.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,20 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+; rdar://problem/10063307
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.0.0"
+
+%0 = type { [2 x i32] }
+%struct.CGPoint = type { float, float }
+
+define void @t(%struct.CGPoint* %a) nounwind {
+  %Point = alloca %struct.CGPoint, align 4
+  %1 = bitcast %struct.CGPoint* %a to i64*
+  %2 = bitcast %struct.CGPoint* %Point to i64*
+  %3 = load i64, i64* %1, align 4
+  store i64 %3, i64* %2, align 4
+  call void @foo(i64* %2) nounwind
+  ret void
+; CHECK: %Point = alloca i64, align 4
+}
+
+declare void @foo(i64*)

Added: llvm/trunk/test/Transforms/InstCombine/2012-01-11-OpaqueBitcastCrash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-01-11-OpaqueBitcastCrash.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-01-11-OpaqueBitcastCrash.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-01-11-OpaqueBitcastCrash.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine -disable-output
+
+%opaque_struct = type opaque
+
+@G = external global [0 x %opaque_struct]
+
+declare void @foo(%opaque_struct*)
+
+define void @bar() {
+  call void @foo(%opaque_struct* bitcast ([0 x %opaque_struct]* @G to %opaque_struct*))
+  ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-02-13-FCmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-02-13-FCmp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-02-13-FCmp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-02-13-FCmp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+; Radar 10803727
+@.str = private unnamed_addr constant [35 x i8] c"\0Ain_range input (should be 0): %f\0A\00", align 1
+@.str1 = external hidden unnamed_addr constant [35 x i8], align 1
+
+declare i32 @printf(i8*, ...)
+define i64 @_Z8tempCastj(i32 %val) uwtable ssp {
+entry:
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str1, i64 0, i64 0), i32 %val)
+  %conv = uitofp i32 %val to double
+  %call.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str, i64 0, i64 0), double %conv)
+  %cmp.i = fcmp oge double %conv, -1.000000e+00
+  br i1 %cmp.i, label %land.rhs.i, label %if.end.critedge
+; CHECK:  br i1 true, label %land.rhs.i, label %if.end.critedge
+
+land.rhs.i:                                       ; preds = %entry
+  %cmp1.i = fcmp olt double %conv, 1.000000e+00
+  br i1 %cmp1.i, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.rhs.i
+  %add = fadd double %conv, 5.000000e-01
+  %conv3 = fptosi double %add to i64
+  br label %return
+
+if.end.critedge:                                  ; preds = %entry
+  br label %if.end
+
+if.end:                                           ; preds = %if.end.critedge, %land.rhs.i
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %retval.0 = phi i64 [ %conv3, %if.then ], [ -1, %if.end ]
+  ret i64 %retval.0
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2012-02-28-ICmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-02-28-ICmp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-02-28-ICmp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-02-28-ICmp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,19 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; <rdar://problem/10803154>
+
+; There should be no transformation.
+; CHECK: %a = trunc i32 %x to i8
+; CHECK: %b = icmp ne i8 %a, 0
+; CHECK: %c = and i32 %x, 16711680
+; CHECK: %d = icmp ne i32 %c, 0
+; CHECK: %e = and i1 %b, %d
+; CHECK: ret i1 %e
+
+define i1 @f1(i32 %x) {
+  %a = trunc i32 %x to i8
+  %b = icmp ne i8 %a, 0
+  %c = and i32 %x, 16711680
+  %d = icmp ne i32 %c, 0
+  %e = and i1 %b, %d
+  ret i1 %e
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-03-10-InstCombine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-03-10-InstCombine.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-03-10-InstCombine.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-03-10-InstCombine.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; RUN: opt < %s -S -instcombine | FileCheck %s
+
+; Derived from gcc.c-torture/execute/frame-address.c
+
+; CHECK-LABEL:     @func(
+; CHECK:     return:
+; CHECK-NOT: ret i32 0
+; CHECK:     ret i32 %retval
+
+define i32 @func(i8* %c, i8* %f) nounwind uwtable readnone noinline ssp {
+entry:
+  %d = alloca i8, align 1
+  store i8 0, i8* %d, align 1
+  %cmp = icmp ugt i8* %d, %c
+  br i1 %cmp, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %cmp2 = icmp ule i8* %d, %f
+  %not.cmp1 = icmp uge i8* %c, %f
+  %.cmp2 = and i1 %cmp2, %not.cmp1
+  %land.ext = zext i1 %.cmp2 to i32
+  br label %return
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp uge i8* %d, %f
+  %not.cmp3 = icmp ule i8* %c, %f
+  %.cmp5 = and i1 %cmp5, %not.cmp3
+  %land.ext7 = zext i1 %.cmp5 to i32
+  br label %return
+
+return:                                           ; preds = %if.else, %if.then
+  %retval.0 = phi i32 [ %land.ext, %if.then ], [ %land.ext7, %if.else ]
+  ret i32 %retval.0
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2012-04-24-vselect.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-04-24-vselect.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-04-24-vselect.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-04-24-vselect.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,13 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+; CHECK-LABEL: @foo(
+; CHECK: <i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+define <8 x i32> @foo() nounwind {
+entry:
+  %v1.i = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>,
+    <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>,
+    <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %v1.i
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2012-04-30-SRem.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-04-30-SRem.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-04-30-SRem.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-04-30-SRem.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+; PR12541
+
+define i32 @foo(i32 %x) {
+  %y = xor i32 %x, 3
+  %z = srem i32 1656690544, %y
+  %sext = shl i32 %z, 24
+  %s = ashr exact i32 %sext, 24
+  ret i32 %s
+; CHECK-NOT: and
+; The shifts were wrongly being turned into an and with 112
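+; Worked example of why that is wrong: shl 24 followed by "ashr exact" 24
+; sign-extends bit 7 of %z, so for %z = 128 the result is -128, whereas
+; "and i32 %z, 112" would yield 0.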
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-05-28-select-hang.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-05-28-select-hang.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-05-28-select-hang.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-05-28-select-hang.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+@c = common global i8 0, align 1
+@a = common global i8 0, align 1
+@b = common global i8 0, align 1
+
+define void @func() nounwind uwtable ssp {
+entry:
+  %0 = load i8, i8* @c, align 1
+  %conv = zext i8 %0 to i32
+  %or = or i32 %conv, 1
+  %conv1 = trunc i32 %or to i8
+  store i8 %conv1, i8* @a, align 1
+  %conv2 = zext i8 %conv1 to i32
+  %neg = xor i32 %conv2, -1
+  %and = and i32 1, %neg
+  %conv3 = trunc i32 %and to i8
+  store i8 %conv3, i8* @b, align 1
+  %1 = load i8, i8* @a, align 1
+  %conv4 = zext i8 %1 to i32
+  %conv5 = zext i8 %conv3 to i32
+  %tobool = icmp ne i32 %conv4, 0
+  br i1 %tobool, label %land.rhs, label %land.end
+
+land.rhs:                                         ; preds = %entry
+  %tobool8 = icmp ne i32 %conv5, 0
+  br label %land.end
+
+land.end:                                         ; preds = %land.rhs, %entry
+  %2 = phi i1 [ false, %entry ], [ %tobool8, %land.rhs ]
+  %land.ext = zext i1 %2 to i32
+  %mul = mul nsw i32 3, %land.ext
+  %conv9 = trunc i32 %mul to i8
+  store i8 %conv9, i8* @a, align 1
+  ret void
+
+; CHECK-LABEL: @func(
+; CHECK-NOT: select
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-06-06-LoadOfPHIs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-06-06-LoadOfPHIs.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-06-06-LoadOfPHIs.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-06-06-LoadOfPHIs.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,162 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; <rdar://problem/10889741>
+
+define void @func(double %r, double %g, double %b, double* %outH, double* %outS, double* %outL) nounwind uwtable ssp {
+bb:
+  %tmp = alloca double, align 8
+  %tmp1 = alloca double, align 8
+  %tmp2 = alloca double, align 8
+  store double %r, double* %tmp, align 8
+  store double %g, double* %tmp1, align 8
+  store double %b, double* %tmp2, align 8
+  %tmp3 = fcmp ogt double %r, %g
+  br i1 %tmp3, label %bb4, label %bb8
+
+bb4:                                              ; preds = %bb
+  %tmp5 = fcmp ogt double %r, %b
+  br i1 %tmp5, label %bb6, label %bb7
+
+bb6:                                              ; preds = %bb4
+  br label %bb12
+
+bb7:                                              ; preds = %bb4
+  br label %bb12
+
+bb8:                                              ; preds = %bb
+  %tmp9 = fcmp ogt double %g, %b
+  br i1 %tmp9, label %bb10, label %bb11
+
+bb10:                                             ; preds = %bb8
+  br label %bb12
+
+bb11:                                             ; preds = %bb8
+  br label %bb12
+
+bb12:                                             ; preds = %bb11, %bb10, %bb7, %bb6
+  %max.0 = phi double* [ %tmp, %bb6 ], [ %tmp2, %bb7 ], [ %tmp1, %bb10 ], [ %tmp2, %bb11 ]
+; CHECK: %tmp13 = load double, double* %tmp, align 8
+; CHECK: %tmp14 = load double, double* %tmp1, align 8
+; CHECK: %tmp15 = fcmp olt double %tmp13, %tmp14
+  %tmp13 = load double, double* %tmp, align 8
+  %tmp14 = load double, double* %tmp1, align 8
+  %tmp15 = fcmp olt double %tmp13, %tmp14
+  br i1 %tmp15, label %bb16, label %bb21
+
+bb16:                                             ; preds = %bb12
+  %tmp17 = load double, double* %tmp2, align 8
+  %tmp18 = fcmp olt double %tmp13, %tmp17
+  br i1 %tmp18, label %bb19, label %bb20
+
+bb19:                                             ; preds = %bb16
+  br label %bb26
+
+bb20:                                             ; preds = %bb16
+  br label %bb26
+
+bb21:                                             ; preds = %bb12
+  %tmp22 = load double, double* %tmp2, align 8
+  %tmp23 = fcmp olt double %tmp14, %tmp22
+  br i1 %tmp23, label %bb24, label %bb25
+
+bb24:                                             ; preds = %bb21
+  br label %bb26
+
+bb25:                                             ; preds = %bb21
+  br label %bb26
+
+bb26:                                             ; preds = %bb25, %bb24, %bb20, %bb19
+  %min.0 = phi double* [ %tmp, %bb19 ], [ %tmp2, %bb20 ], [ %tmp1, %bb24 ], [ %tmp2, %bb25 ]
+; CHECK: %tmp27 = load double, double* %min.0, align 8
+; CHECK: %tmp28 = load double, double* %max.0
+; CHECK: %tmp29 = fadd double %tmp27, %tmp28
+  %tmp27 = load double, double* %min.0, align 8
+  %tmp28 = load double, double* %max.0
+  %tmp29 = fadd double %tmp27, %tmp28
+  %tmp30 = fdiv double %tmp29, 2.000000e+00
+  store double %tmp30, double* %outL
+  %tmp31 = load double, double* %min.0
+  %tmp32 = load double, double* %max.0
+  %tmp33 = fcmp oeq double %tmp31, %tmp32
+  br i1 %tmp33, label %bb34, label %bb35
+
+bb34:                                             ; preds = %bb26
+  store double 0.000000e+00, double* %outS
+  store double 0.000000e+00, double* %outH
+  br label %bb81
+
+bb35:                                             ; preds = %bb26
+  %tmp36 = fcmp olt double %tmp30, 5.000000e-01
+  %tmp37 = fsub double %tmp32, %tmp31
+  br i1 %tmp36, label %bb38, label %bb41
+
+bb38:                                             ; preds = %bb35
+  %tmp39 = fadd double %tmp32, %tmp31
+  %tmp40 = fdiv double %tmp37, %tmp39
+  store double %tmp40, double* %outS
+  br label %bb45
+
+bb41:                                             ; preds = %bb35
+  %tmp42 = fsub double 2.000000e+00, %tmp32
+  %tmp43 = fsub double %tmp42, %tmp31
+  %tmp44 = fdiv double %tmp37, %tmp43
+  store double %tmp44, double* %outS
+  br label %bb45
+
+bb45:                                             ; preds = %bb41, %bb38
+  %tmp46 = icmp eq double* %max.0, %tmp
+  br i1 %tmp46, label %bb47, label %bb55
+
+bb47:                                             ; preds = %bb45
+  %tmp48 = load double, double* %tmp1, align 8
+  %tmp49 = load double, double* %tmp2, align 8
+  %tmp50 = fsub double %tmp48, %tmp49
+  %tmp51 = load double, double* %max.0
+  %tmp52 = load double, double* %min.0
+  %tmp53 = fsub double %tmp51, %tmp52
+  %tmp54 = fdiv double %tmp50, %tmp53
+  store double %tmp54, double* %outH
+  br label %bb75
+
+bb55:                                             ; preds = %bb45
+  %tmp56 = icmp eq double* %max.0, %tmp1
+  br i1 %tmp56, label %bb57, label %bb66
+
+bb57:                                             ; preds = %bb55
+  %tmp58 = load double, double* %tmp2, align 8
+  %tmp59 = load double, double* %tmp, align 8
+  %tmp60 = fsub double %tmp58, %tmp59
+  %tmp61 = load double, double* %max.0
+  %tmp62 = load double, double* %min.0
+  %tmp63 = fsub double %tmp61, %tmp62
+  %tmp64 = fdiv double %tmp60, %tmp63
+  %tmp65 = fadd double 2.000000e+00, %tmp64
+  store double %tmp65, double* %outH
+  br label %bb75
+
+bb66:                                             ; preds = %bb55
+  %tmp67 = load double, double* %tmp, align 8
+  %tmp68 = load double, double* %tmp1, align 8
+  %tmp69 = fsub double %tmp67, %tmp68
+  %tmp70 = load double, double* %max.0
+  %tmp71 = load double, double* %min.0
+  %tmp72 = fsub double %tmp70, %tmp71
+  %tmp73 = fdiv double %tmp69, %tmp72
+  %tmp74 = fadd double 4.000000e+00, %tmp73
+  store double %tmp74, double* %outH
+  br label %bb75
+
+bb75:                                             ; preds = %bb66, %bb57, %bb47
+  %tmp76 = load double, double* %outH
+  %tmp77 = fdiv double %tmp76, 6.000000e+00
+  store double %tmp77, double* %outH
+  %tmp78 = fcmp olt double %tmp77, 0.000000e+00
+  br i1 %tmp78, label %bb79, label %bb81
+
+bb79:                                             ; preds = %bb75
+  %tmp80 = fadd double %tmp77, 1.000000e+00
+  store double %tmp80, double* %outH
+  br label %bb81
+
+bb81:                                             ; preds = %bb79, %bb75, %bb34
+  ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-07-25-LoadPart.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-07-25-LoadPart.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-07-25-LoadPart.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt < %s -data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE
+; PR13442
+
+@test = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
+
+define i64 @foo() {
+  %ret = load i64, i64* bitcast (i8* getelementptr (i8, i8* bitcast ([4 x i32]* @test to i8*), i64 2) to i64*), align 1
+  ret i64 %ret
+  ; 0x00030000_00020000 in [01 00/00 00 02 00 00 00 03 00/00 00 04 00 00 00]
+  ; LE: ret i64 844424930263040
+  ; 0x00000200_00000300 in [00 00/00 01 00 00 00 02 00 00/00 03 00 00 00 04]
+  ; BE: ret i64 281474976841728
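+  ; (Derivation: the i64 load reads bytes 2..9 of the 16-byte constant;
+  ; LE memory 01 00 00 00 02 00 00 00 03 00 00 00 04 00 00 00 gives
+  ; 0x0003000000020000 = 844424930263040, BE memory gives
+  ; 0x0001000000020000 = 281474976841728.)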
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: addrspacecast
+
+@base = internal unnamed_addr addrspace(3) global [16 x i32] zeroinitializer, align 16
+declare void @foo(i32*)
+
+define void @test() nounwind {
+  call void @foo(i32* getelementptr (i32, i32* addrspacecast ([16 x i32] addrspace(3)* @base to i32*), i64 2147483647)) nounwind
+  ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,57 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; rdar://12182093
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: @udiv400(
+; CHECK: udiv i32 %x, 400
+; CHECK: ret
+define i32 @udiv400(i32 %x) {
+entry:
+  %div = lshr i32 %x, 2
+  %div1 = udiv i32 %div, 100
+  ret i32 %div1
+}
+
+
+; CHECK-LABEL: @udiv400_no(
+; CHECK: ashr
+; CHECK: div
+; CHECK: ret
+define i32 @udiv400_no(i32 %x) {
+entry:
+  %div = ashr i32 %x, 2
+  %div1 = udiv i32 %div, 100
+  ret i32 %div1
+}
+
+; CHECK-LABEL: @sdiv400_yes(
+; CHECK: udiv i32 %x, 400
+; CHECK: ret
+define i32 @sdiv400_yes(i32 %x) {
+entry:
+  %div = lshr i32 %x, 2
+  ; The sign bits of both operands are zero (i.e. we can prove they are
+  ; unsigned inputs), so turn this into a udiv.
+  ; Next, optimize it just like the udiv case above.
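+  ; (Worked check: %div = %x >>u 2 < 2^30, so its sign bit is clear and
+  ; sdiv equals udiv here; then (%x /u 4) /u 100 == %x /u 400.)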
+  %div1 = sdiv i32 %div, 100
+  ret i32 %div1
+}
+
+
+; CHECK-LABEL: @udiv_i80(
+; CHECK: udiv i80 %x, 400
+; CHECK: ret
+define i80 @udiv_i80(i80 %x) {
+  %div = lshr i80 %x, 2
+  %div1 = udiv i80 %div, 100
+  ret i80 %div1
+}
+
+define i32 @no_crash_notconst_udiv(i32 %x, i32 %notconst) {
+  %div = lshr i32 %x, %notconst
+  %div1 = udiv i32 %div, 100
+  ret i32 %div1
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,24 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; When merging zero-sized allocas, check that the requested alignments of the
+; allocas are obeyed.
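+; (Expected outcome, inferred from the CHECK lines below: the allocas are
+; merged and the merged alloca keeps the larger alignment, max(1, 1024) = 1024.)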
+
+@x = global i8* null, align 8
+@y = global i8* null, align 8
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: @f(
+; CHECK-NEXT: alloca [0 x i8], align 1024
+; CHECK-NOT: alloca
+; CHECK: ret void
+define void @f() {
+  %1 = alloca [0 x i8], align 1
+  %2 = alloca [0 x i8], align 1024
+  %3 = getelementptr inbounds [0 x i8], [0 x i8]* %1, i64 0, i64 0
+  %4 = getelementptr inbounds [0 x i8], [0 x i8]* %2, i64 0, i64 0
+  store i8* %3, i8** @x, align 8
+  store i8* %4, i8** @y, align 8
+  ret void
+}
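
Both allocas here allocate zero bytes, so InstCombine may merge them into a single allocation; the test pins down that the survivor carries the larger requested alignment, max(1, 1024) = 1024. A hand-written sketch of the expected result, reusing the test's @x/@y globals (illustrative only, not commit output):

define void @f_merged() {
  %buf = alloca [0 x i8], align 1024
  %p = getelementptr inbounds [0 x i8], [0 x i8]* %buf, i64 0, i64 0
  store i8* %p, i8** @x, align 8
  store i8* %p, i8** @y, align 8
  ret void
}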

Added: llvm/trunk/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,51 @@
+; RUN: opt < %s -instcombine -S
+
+; Make sure that we don't crash when optimizing vectors of pointers.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.hoge = type { double*, double*, double*, double** }
+
+define void @widget(%struct.hoge* nocapture %arg) nounwind uwtable ssp {
+bb:
+  %tmp = getelementptr inbounds %struct.hoge, %struct.hoge* %arg, i64 0, i32 0
+  br i1 undef, label %bb1, label %bb17
+
+bb1:                                              ; preds = %bb
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  br label %bb17
+
+bb3:                                              ; preds = %bb1
+  %tmp4 = bitcast double** %tmp to <2 x double*>*
+  %tmp5 = load <2 x double*>, <2 x double*>* %tmp4, align 8
+  %tmp6 = ptrtoint <2 x double*> %tmp5 to <2 x i64>
+  %tmp7 = sub <2 x i64> zeroinitializer, %tmp6
+  %tmp8 = ashr exact <2 x i64> %tmp7, <i64 3, i64 3>
+  %tmp9 = extractelement <2 x i64> %tmp8, i32 0
+  %tmp10 = add nsw i64 undef, %tmp9
+  br i1 undef, label %bb11, label %bb12
+
+bb11:                                             ; preds = %bb3
+  br label %bb13
+
+bb12:                                             ; preds = %bb3
+  br label %bb13
+
+bb13:                                             ; preds = %bb12, %bb11
+  br i1 undef, label %bb16, label %bb14
+
+bb14:                                             ; preds = %bb13
+  br i1 undef, label %bb16, label %bb15
+
+bb15:                                             ; preds = %bb14
+  br label %bb16
+
+bb16:                                             ; preds = %bb15, %bb14, %bb13
+  unreachable
+
+bb17:                                             ; preds = %bb2, %bb
+  ret void
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+define <4 x i32> @foo(<4 x i32*>* %in) {
+  %t17 = load <4 x i32*>, <4 x i32*>* %in, align 8
+  %t18 = icmp eq <4 x i32*> %t17, zeroinitializer
+  %t19 = zext <4 x i1> %t18 to <4 x i32>
+  ret <4 x i32> %t19
+}

Added: llvm/trunk/test/Transforms/InstCombine/2012-3-15-or-xor-constant.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-3-15-or-xor-constant.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-3-15-or-xor-constant.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-3-15-or-xor-constant.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,12 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR12234
+
+ at g = extern_weak global i32
+define i32 @function(i32 %x) nounwind {
+entry:
+  %xor = xor i32 %x, 1
+  store volatile i32 %xor, i32* inttoptr (i64 1 to i32*), align 4
+  %or4 = or i32 or (i32 zext (i1 icmp eq (i32* @g, i32* null) to i32), i32 1), %xor
+  ret i32 %or4
+}
+; CHECK-LABEL: define i32 @function(

Added: llvm/trunk/test/Transforms/InstCombine/2012-6-7-vselect-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2012-6-7-vselect-bitcast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2012-6-7-vselect-bitcast.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2012-6-7-vselect-bitcast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: bitcast
+
+define void @foo(<16 x i8> %a, <16 x i8> %b, <4 x i32>* %c) {
+  %aa = bitcast <16 x i8> %a to <4 x i32>
+  %bb = bitcast <16 x i8> %b to <4 x i32>
+  %select_v = select <4 x i1> zeroinitializer, <4 x i32> %aa, <4 x i32> %bb
+  store <4 x i32> %select_v, <4 x i32>* %c, align 4
+  ret void
+}
+

Added: llvm/trunk/test/Transforms/InstCombine/2013-03-05-Combine-BitcastTy-Into-Alloca.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/2013-03-05-Combine-BitcastTy-Into-Alloca.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/2013-03-05-Combine-BitcastTy-Into-Alloca.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/2013-03-05-Combine-BitcastTy-Into-Alloca.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,45 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%struct._my_struct = type <{ [12 x i8], [4 x i8] }>
+
+ at initval = common global %struct._my_struct zeroinitializer, align 1
+
+; InstCombine will try to change the %struct._my_struct alloca into an
+; allocation of an i96 because of the bitcast to create %2. That's not valid,
+; as the remaining 32 bits of the structure still feed into the return value.
+define { i64, i64 } @function(i32 %x, i32 %y, i32 %z) nounwind {
+; CHECK-LABEL: @function(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %retval = alloca %struct._my_struct, align 8
+; CHECK-NOT: bitcast i96* %retval to %struct._my_struct*
+entry:
+  %retval = alloca %struct._my_struct, align 8
+  %k.sroa.0.0.copyload = load i96, i96* bitcast (%struct._my_struct* @initval to i96*), align 1
+  %k.sroa.1.12.copyload = load i32, i32* bitcast ([4 x i8]* getelementptr inbounds (%struct._my_struct, %struct._my_struct* @initval, i64 0, i32 1) to i32*), align 1
+  %0 = zext i32 %x to i96
+  %bf.value = shl nuw nsw i96 %0, 6
+  %bf.clear = and i96 %k.sroa.0.0.copyload, -288230376151711744
+  %1 = zext i32 %y to i96
+  %bf.value2 = shl nuw nsw i96 %1, 32
+  %bf.shl3 = and i96 %bf.value2, 288230371856744448
+  %bf.value.masked = and i96 %bf.value, 4294967232
+  %2 = zext i32 %z to i96
+  %bf.value8 = and i96 %2, 63
+  %bf.clear4 = or i96 %bf.shl3, %bf.value.masked
+  %bf.set5 = or i96 %bf.clear4, %bf.value8
+  %bf.set10 = or i96 %bf.set5, %bf.clear
+  %retval.0.cast7 = bitcast %struct._my_struct* %retval to i96*
+  store i96 %bf.set10, i96* %retval.0.cast7, align 8
+  %retval.12.idx8 = getelementptr inbounds %struct._my_struct, %struct._my_struct* %retval, i64 0, i32 1
+  %retval.12.cast9 = bitcast [4 x i8]* %retval.12.idx8 to i32*
+  store i32 %k.sroa.1.12.copyload, i32* %retval.12.cast9, align 4
+  %trunc = trunc i96 %bf.set10 to i64
+  %.fca.0.insert = insertvalue { i64, i64 } undef, i64 %trunc, 0
+  %retval.8.idx12 = getelementptr inbounds %struct._my_struct, %struct._my_struct* %retval, i64 0, i32 0, i64 8
+  %retval.8.cast13 = bitcast i8* %retval.8.idx12 to i64*
+  %retval.8.load14 = load i64, i64* %retval.8.cast13, align 8
+  %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.8.load14, 1
+  ret { i64, i64 } %.fca.1.insert
+}
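
The arithmetic behind the comment above: i96 spans only 12 bytes, while the packed type <{ [12 x i8], [4 x i8] }> spans 16. Rewriting %retval as an i96 alloca would lose the trailing 4 bytes that the store through %retval.12.cast9 writes and that the i64 load at byte offset 8 reads back, which is exactly what the CHECK-NOT line rules out.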

Added: llvm/trunk/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,71 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+; ARM64 neon intrinsic variants - <rdar://problem/12349617>
+; REQUIRES: aarch64
+
+define <4 x i32> @mulByZeroARM64(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> zeroinitializer
+}
+
+define <4 x i32> @mulByOneARM64(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @constantMulARM64() nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+}
+
+define <4 x i32> @constantMulSARM64() nounwind readnone ssp {
+entry:
+  %b = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+}
+
+define <4 x i32> @constantMulUARM64() nounwind readnone ssp {
+entry:
+  %b = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+}
+
+define <4 x i32> @complex1ARM64(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
+  %b = add <4 x i32> zeroinitializer, %a
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @complex2ARM64(<4 x i32> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+  %b = add <4 x i32> %x, %a
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: ret <4 x i32> %b
+}
+
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+
+; CHECK: attributes #0 = { nounwind readnone ssp }
+; CHECK: attributes #1 = { nounwind readnone }
+; CHECK: attributes [[NUW]] = { nounwind }
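
The signed/unsigned pair is the instructive case: smull sign-extends its i16 operands, so <i16 -1> * <i16 1> folds to -1 in every lane, while umull zero-extends, so the same bit pattern is 0xFFFF = 65535 and the product folds to 65535. Multiplying by one cannot fold away entirely because the widening remains observable, hence @mulByOneARM64 folds to a plain sext rather than returning its input.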

Added: llvm/trunk/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,44 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+; ARM64 AES intrinsic variants
+
+define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: @combineXorAeseZeroARM64(
+; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+; CHECK-NEXT:    ret <16 x i8> %data.aes
+  %data.xor = xor <16 x i8> %data, %key
+  %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data.xor, <16 x i8> zeroinitializer)
+  ret <16 x i8> %data.aes
+}
+
+define <16 x i8> @combineXorAeseNonZeroARM64(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: @combineXorAeseNonZeroARM64(
+; CHECK-NEXT:    %data.xor = xor <16 x i8> %data, %key
+; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    ret <16 x i8> %data.aes
+  %data.xor = xor <16 x i8> %data, %key
+  %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  ret <16 x i8> %data.aes
+}
+
+define <16 x i8> @combineXorAesdZeroARM64(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: @combineXorAesdZeroARM64(
+; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+; CHECK-NEXT:    ret <16 x i8> %data.aes
+  %data.xor = xor <16 x i8> %data, %key
+  %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data.xor, <16 x i8> zeroinitializer)
+  ret <16 x i8> %data.aes
+}
+
+define <16 x i8> @combineXorAesdNonZeroARM64(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: @combineXorAesdNonZeroARM64(
+; CHECK-NEXT:    %data.xor = xor <16 x i8> %data, %key
+; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    ret <16 x i8> %data.aes
+  %data.xor = xor <16 x i8> %data, %key
+  %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  ret <16 x i8> %data.aes
+}
+
+declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8>, <16 x i8>) #0
+declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8>, <16 x i8>) #0
+
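The rewrite is justified by the instruction semantics: AESE and AESD begin with an AddRoundKey step that XORs their two operands, so aese(data ^ key, 0) computes the same value as aese(data, key). That is why only the zeroinitializer variants are combined; with an all-ones key operand the leading xor cannot be absorbed, and the NonZero tests check that the IR is left alone.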

Added: llvm/trunk/test/Transforms/InstCombine/AArch64/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AArch64/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AArch64/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/InstCombine/AArch64/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'AArch64' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/InstCombine/AArch64/tbl1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AArch64/tbl1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AArch64/tbl1.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/AArch64/tbl1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-arm-none-eabi"
+
+; Turning a table lookup intrinsic into a shuffle vector instruction
+; can be beneficial. If the mask used for the lookup is the constant
+; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64
+; instructions instead.
+
+define <8 x i8> @tbl1_8x8(<16 x i8> %vec) {
+; CHECK-LABEL: @tbl1_8x8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC:%.*]], <16 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[TMP0]]
+;
+entry:
+  %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbl1
+}
+
+; Bail out of the optimization if a mask index is out of range.
+define <8 x i8> @tbl1_8x8_out_of_range(<16 x i8> %vec) {
+; CHECK-LABEL: @tbl1_8x8_out_of_range(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TBL1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VEC:%.*]], <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL1]]
+;
+entry:
+  %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbl1
+}
+
+; Bail out of the optimization if the return vector does not have 8 elements.
+define <16 x i8> @tbl1_16x8(<16 x i8> %vec) {
+; CHECK-LABEL: @tbl1_16x8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TBL1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[VEC:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL1]]
+;
+entry:
+  %tbl1 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %vec, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %tbl1
+}
+
+; Bail out of the optimization if the elements of the return vector are not of type i8.
+define <8 x i16> @tbl1_8x16(<16 x i8> %vec) {
+; CHECK-LABEL: @tbl1_8x16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; CHECK-NEXT:    ret <8 x i16> [[TBL1]]
+;
+entry:
+  %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret <8 x i16> %tbl1
+}
+
+; The type <8 x i16> is not a valid return type for this intrinsic,
+; but we want to test that the optimization won't trigger for vector
+; elements of a type other than i8.
+declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>)
+
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
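
In the one case that fires, the constant mask makes the lookup a pure permutation of the 16-byte table, which is exactly a shufflevector with the same indices; the three bail-out tests pin down the preconditions: every index in range, an 8-element result, and i8 elements. A minimal equivalent of the rewritten form (illustrative):

define <8 x i8> @tbl1_as_shuffle(<16 x i8> %vec) {
  ; same lanes as the {7,6,5,4,3,2,1,0} table lookup
  %rev = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %rev
}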

Added: llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2407 @@
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.buffer.load
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @buffer_load_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  ret float %data
+}
+
+; CHECK-LABEL: @buffer_load_v1f32(
+; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <1 x float> %data
+define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  ret <1 x float> %data
+}
+
+; CHECK-LABEL: @buffer_load_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  ret <2 x float> %data
+}
+
+; CHECK-LABEL: @buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <4 x float> %data
+define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  ret <4 x float> %data
+}
+
+; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
+; CHECK: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <2 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <4 x float> %data, i32 2
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <4 x float> %data, i32 3
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float>
+define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <3 x float> %data
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; FIXME: Not handled even though only 2 elts used
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt0 = extractelement <4 x float> %data, i32 0
+; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 1
+; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0
+; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
+; CHECK-NEXT: ret { float, float } %ins1
+define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  %elt1 = extractelement <4 x float> %data, i32 1
+  %ins0 = insertvalue { float, float } undef, float %elt0, 0
+  %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
+  ret { float, float } %ins1
+}
+
+; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt0 = extractelement <3 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <3 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <3 x float> %data, i32 2
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float>
+define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
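The pattern throughout this block: when only some leading elements of the loaded vector are demanded, the call is shrunk to the shortest overload (f32, v2f32, v3f32) that still covers every demanded element, and the extracts or shuffles are rewritten against the narrower result. Uses that reach element 3, non-prefix shuffles such as <2 x i32> <i32 2, i32 3>, and the multi-extract struct case marked FIXME keep the full-width load. A minimal sketch of the elt0-only rewrite (illustrative, not commit output):

; only element 0 of a <4 x float> load is demanded, so the call
; shrinks to the scalar overload
define amdgpu_ps float @only_elt0(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
  ret float %data
}

declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)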
+; --------------------------------------------------------------------
+; llvm.amdgcn.buffer.load.format
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @buffer_load_format_v1f32(
+; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
+; CHECK-NEXT: ret <1 x float> %data
+define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
+  ret <1 x float> %data
+}
+
+; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; The initial insertion point is at the extractelement.
+; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <2 x double>
+; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0
+; CHECK-NEXT: ret double %tmp2
+define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %tmp1 = bitcast <4 x float> %tmp to <2 x double>
+  %tmp2 = extractelement <2 x double> %tmp1, i32 0
+  ret double %tmp2
+}
+
+; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
+; CHECK-NEXT: ret i32 %tmp2
+define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
+  ret i32 %tmp2
+}
+
+; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %1 = bitcast float %tmp to i32
+; CHECK-NEXT: %tmp2 = trunc i32 %1 to i16
+; CHECK-NEXT: ret i16 %tmp2
+define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %tmp1 = bitcast <4 x float> %tmp to <8 x i16>
+  %tmp2 = extractelement <8 x i16> %tmp1, i32 0
+  ret i16 %tmp2
+}
+
+declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.raw.buffer.load
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @raw_buffer_load_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  ret float %data
+}
+
+; CHECK-LABEL: @raw_buffer_load_v1f32(
+; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <1 x float> %data
+define amdgpu_ps <1 x float> @raw_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  ret <1 x float> %data
+}
+
+; CHECK-LABEL: @raw_buffer_load_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  ret <2 x float> %data
+}
+
+; CHECK-LABEL: @raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <4 x float> %data
+define amdgpu_ps <4 x float> @raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  ret <4 x float> %data
+}
+
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f32(
+; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <2 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 2
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 3
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float>
+define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <3 x float> %data
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_v3f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <3 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <3 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <3 x float> %data, i32 2
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float>
+define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
+; CHECK-NEXT: ret i32 %tmp2
+define i32 @extract0_bitcast_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
+  ret i32 %tmp2
+}
+
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_v4i32(
+; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
+; CHECK-NEXT: ret float %tmp2
+define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %tmp = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
+  %tmp2 = extractelement <4 x float> %tmp1, i32 0
+  ret float %tmp2
+}
+
+; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #1
+declare <1 x float> @llvm.amdgcn.raw.buffer.load.v1f32(<4 x i32>, i32, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #1
+declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1
+
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1
+
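The raw.buffer.load tests above replay the same demanded-elements narrowing against the newer signature, where the two i1 flags are replaced by an i32 cache-policy operand, and again check that metadata such as !fpmath survives onto the narrowed call.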
+; --------------------------------------------------------------------
+; llvm.amdgcn.raw.buffer.load.format
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @raw_buffer_load_format_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @raw_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  ret float %data
+}
+
+; CHECK-LABEL: @raw_buffer_load_format_v1f32(
+; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <1 x float> %data
+define amdgpu_ps <1 x float> @raw_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  ret <1 x float> %data
+}
+
+; CHECK-LABEL: @raw_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  ret <2 x float> %data
+}
+
+; CHECK-LABEL: @raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <4 x float> %data
+define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  ret <4 x float> %data
+}
+
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v2f32(
+; CHECK: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <2 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt2 = extractelement <4 x float> %data, i32 2
+  ret float %elt2
+}
+
+; CHECK-LABEL: @extract_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt3 = extractelement <4 x float> %data, i32 3
+; CHECK-NEXT: ret float %elt3
+define amdgpu_ps float @extract_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt3 = extractelement <4 x float> %data, i32 3
+  ret float %elt3
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <3 x float> %data
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_raw_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <3 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_raw_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <3 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_raw_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt2
+define amdgpu_ps float @extract_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %elt2 = extractelement <3 x float> %data, i32 2
+  ret float %elt2
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
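+; Demanded elements are also tracked through bitcasts: the load is shrunk to
+; a scalar and the bitcast is rewritten to operate on the scalar value.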
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
+; CHECK-NEXT: ret i32 %tmp2
+define i32 @extract0_bitcast_raw_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
+  ret i32 %tmp2
+}
+
+; CHECK-LABEL: @extract0_bitcast_raw_buffer_load_format_v4i32(
+; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.raw.buffer.load.format.i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
+; CHECK-NEXT: ret float %tmp2
+define float @extract0_bitcast_raw_buffer_load_format_v4i32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %tmp = call <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
+  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
+  %tmp2 = extractelement <4 x float> %tmp1, i32 0
+  ret float %tmp2
+}
+
+; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #1
+declare <1 x float> @llvm.amdgcn.raw.buffer.load.format.v1f32(<4 x i32>, i32, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #1
+declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #1
+
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.struct.buffer.load
+; --------------------------------------------------------------------
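+;
+; The tests in this group exercise InstCombine's demanded-vector-elements
+; simplification: when only leading elements of the loaded vector are used,
+; the call is shrunk to the narrowest type that still covers the highest
+; demanded element (trailing elements are dropped; indices are not remapped).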
+
+; CHECK-LABEL: @struct_buffer_load_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @struct_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  ret float %data
+}
+
+; CHECK-LABEL: @struct_buffer_load_v1f32(
+; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <1 x float> %data
+define amdgpu_ps <1 x float> @struct_buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  ret <1 x float> %data
+}
+
+; CHECK-LABEL: @struct_buffer_load_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  ret <2 x float> %data
+}
+
+; CHECK-LABEL: @struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <4 x float> %data
+define amdgpu_ps <4 x float> @struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  ret <4 x float> %data
+}
+
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <2 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt2
+define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt2 = extractelement <4 x float> %data, i32 2
+  ret float %elt2
+}
+
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt3 = extractelement <4 x float> %data, i32 3
+; CHECK-NEXT: ret float %elt3
+define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt3 = extractelement <4 x float> %data, i32 3
+  ret float %elt3
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <3 x float> %data
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_v3f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <3 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <3 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt2
+define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt2 = extractelement <3 x float> %data, i32 2
+  ret float %elt2
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
+; CHECK-NEXT: ret i32 %tmp2
+define i32 @extract0_bitcast_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
+  ret i32 %tmp2
+}
+
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_v4i32(
+; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
+; CHECK-NEXT: ret float %tmp2
+define float @extract0_bitcast_struct_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %tmp = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
+  %tmp2 = extractelement <4 x float> %tmp1, i32 0
+  ret float %tmp2
+}
+
+; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #1
+declare <1 x float> @llvm.amdgcn.struct.buffer.load.v1f32(<4 x i32>, i32, i32, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #1
+declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1
+
+declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.struct.buffer.load.format
+; --------------------------------------------------------------------
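+;
+; Same demanded-elements narrowing as the plain loads above; buffer format
+; conversion is per element, so dropping unused trailing elements is safe.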
+
+; CHECK-LABEL: @struct_buffer_load_format_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @struct_buffer_load_format_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  ret float %data
+}
+
+; CHECK-LABEL: @struct_buffer_load_format_v1f32(
+; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <1 x float> %data
+define amdgpu_ps <1 x float> @struct_buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  ret <1 x float> %data
+}
+
+; CHECK-LABEL: @struct_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  ret <2 x float> %data
+}
+
+; CHECK-LABEL: @struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <4 x float> %data
+define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  ret <4 x float> %data
+}
+
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <2 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt2
+define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt2 = extractelement <4 x float> %data, i32 2
+  ret float %elt2
+}
+
+; CHECK-LABEL: @extract_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt3 = extractelement <4 x float> %data, i32 3
+; CHECK-NEXT: ret float %elt3
+define amdgpu_ps float @extract_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt3 = extractelement <4 x float> %data, i32 3
+  ret float %elt3
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <3 x float> %data
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_struct_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt0 = extractelement <3 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_struct_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt1 = extractelement <3 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_struct_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt2
+define amdgpu_ps float @extract_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %elt2 = extractelement <3 x float> %data, i32 2
+  ret float %elt2
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
+; CHECK-NEXT: ret i32 %tmp2
+define i32 @extract0_bitcast_struct_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
+  ret i32 %tmp2
+}
+
+; CHECK-LABEL: @extract0_bitcast_struct_buffer_load_format_v4i32(
+; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
+; CHECK-NEXT: ret float %tmp2
+define float @extract0_bitcast_struct_buffer_load_format_v4i32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %tmp = call <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
+  %tmp1 = bitcast <4 x i32> %tmp to <4 x float>
+  %tmp2 = extractelement <4 x float> %tmp1, i32 0
+  ret float %tmp2
+}
+
+; CHECK-LABEL: @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @preserve_metadata_extract_elt0_struct_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #1
+declare <1 x float> @llvm.amdgcn.struct.buffer.load.format.v1f32(<4 x i32>, i32, i32, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #1
+declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1
+
+declare <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32>, i32, i32, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample
+; --------------------------------------------------------------------
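+;
+; For the image sample intrinsics the dmask operand selects which channels are
+; returned, so the combine both narrows the result type and rewrites the dmask
+; to request only the demanded channels.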
+
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; Check that the intrinsic remains unchanged in the presence of TFE or LWE
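+; (with TFE/LWE the instruction additionally returns a status value, so the
+; result cannot be shrunk without changing how much data is written)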
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe(
+; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
+; CHECK: ret float %elt0
+define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
+  %data.vec = extractvalue {<4 x float>,i32} %data, 0
+  %elt0 = extractelement <4 x float> %data.vec, i32 0
+  ret float %elt0
+}
+
+; Check that the intrinsic remains unchanged in the presence of TFE or LWE
+; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe(
+; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
+; CHECK: ret float %elt0
+define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
+  %data.vec = extractvalue {<4 x float>,i32} %data, 0
+  %elt0 = extractelement <4 x float> %data.vec, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
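+; A dmask of 0 demands no channels, so the whole sample is dead and the
+; extracted element folds to undef.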
+; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(
+; CHECK-NEXT: ret float undef
+define amdgpu_ps float @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(float %s, float %t, float %r, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float %s, float %t, float %r, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1darray.f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_dmask_0001_image_sample_1darray_v4f32_f32(float %s, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 1, float %s, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_dmask_0010_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 2, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_dmask_0100_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 4, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_dmask_1000_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 8, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_dmask_1001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 9, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
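+; For multi-element extracts the dmask is trimmed to the channel bits that are
+; actually used and the result type is shrunk to match; if only one enabled
+; channel survives, the scalar form is used and the vector is rebuilt with
+; insertelement.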
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %1 = insertelement <2 x float> undef, float %data, i32 0
+; CHECK-NEXT: ret <2 x float> %1
+define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %1 = insertelement <3 x float> undef, float %data, i32 0
+; CHECK-NEXT: ret <3 x float> %1
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0011_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 3, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %shuf = shufflevector <2 x float> %data, <2 x float> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0101_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 5, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret <3 x float> %data
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.1d.v3f32.f32(i32 7, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret <3 x float> %data
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.cl
+; --------------------------------------------------------------------
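+; One spot check per remaining sample variant (cl, d, d.cl, l, b): the same
+; narrowing should fire regardless of the sample opcode.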
+
+; CHECK-LABEL: @extract_elt1_image_sample_cl_2darray_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.2darray.f32.f32(i32 2, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt1_image_sample_cl_2darray_v4f32_f32(float %s, float %t, float %slice, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.d
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt2_image_sample_d_cube_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cube.f32.f32.f32(i32 4, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt2_image_sample_d_cube_v4f32_f32_f32(float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %face, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt2 = extractelement <4 x float> %data, i32 2
+  ret float %elt2
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.cube.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.d.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.1d.f32.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt3_image_sample_d_cl_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt3 = extractelement <4 x float> %data, i32 3
+  ret float %elt3
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.l
+; --------------------------------------------------------------------
+
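+; With dmask 0110 the two returned elements are channels y (bit 1) and
+; z (bit 2), so using only element 1 demands channel z and the dmask
+; shrinks to 0100 (4).
+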
+; CHECK-LABEL: @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.1d.f32.f32(i32 4, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt1_dmask_0110_image_sample_l_1d_v2f32_f32(float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32 6, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt1 = extractelement <2 x float> %data, i32 1
+  ret float %elt1
+}
+
+declare <2 x float> @llvm.amdgcn.image.sample.l.1d.v2f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.b
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.1d.f32.f32.f32(i32 8, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt1_dmask_1001_image_sample_b_1d_v4f32_f32_f32(float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 9, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.b.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.b.cl.1d.v2f32.f32.f32(i32 12, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt1_elt2_dmask_1101_image_sample_b_cl_1d_v4f32_f32_f32(float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 13, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.lz
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.image.sample.lz.1d.v2f32.f32(i32 10, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt1_elt3_image_sample_lz_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 3>
+  ret <2 x float> %shuf
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.cd
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.image.sample.cd.1d.v3f32.f32.f32(i32 14, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret <3 x float> %data
+define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_image_sample_cd_1d_v4f32_f32_f32(float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.cd.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x half> %data, i32 0
+  ret half %elt0
+}
+
+declare <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_cl_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.1d.f32.f32(i32 1, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_cl_1d_v4f32_f32(float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.d
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_d_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.d.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.l
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_l_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.1d.f32.f32(i32 1, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_l_1d_v4f32_f32(float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.b
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_b_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.b.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.1d.f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_1d_v4f32_f32_f32(float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.lz
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_lz_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.1d.f32.f32(i32 1, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_lz_1d_v4f32_f32(float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.cd
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_cd_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.cd.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.1d.f32.f32.f32(i32 1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_1d_v4f32_f32_f32(float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_o_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32, i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_cl_o_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cl.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_cl_o_1d_v4f32_f32(i32 %offset, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.d.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_d_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.d.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.l.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_l_o_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.l.o.1d.f32.f32(i32 1, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_l_o_1d_v4f32_f32(i32 %offset, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.b.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.b.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.lz.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_lz_o_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.lz.o.1d.f32.f32(i32 1, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_lz_o_1d_v4f32_f32(i32 %offset, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32, i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.cd.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_cd_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.cd.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_o_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cl.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_cl_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.d.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_d_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.d.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.d.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_d_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.l.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.l.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_l_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.b.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_b_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.b.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.b.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_b_cl_o_1d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.lz.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.lz.o.1d.f32.f32(i32 1, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_lz_o_1d_v4f32_f32(i32 %offset, float %zcompare, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.cd.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_cd_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.sample.c.cd.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.c.cd.cl.o.1d.f32.f32.f32(i32 1, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_sample_c_cd_cl_o_1d_v4f32_f32_f32(i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4
+; --------------------------------------------------------------------
+
+; Don't handle gather4*
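+; gather4 returns one value per texel of a 2x2 footprint for a single
+; channel, and its dmask must have exactly one bit set, so there are no
+; unused channels to trim and the calls are left unchanged.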
+
+; CHECK-LABEL: @extract_elt0_image_gather4_2d_v4f32_f32(
+; CHECK: %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_2d_v4f32_f32(float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_cl_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_cl_2d_v4f32_f32(float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 2, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.l
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_l_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_l_2d_v4f32_f32(float %s, float %t, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 4, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.b
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_b_2darray_v4f32_f32_f32(float %bias, float %s, float %t, float %slice, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32 8, float %bias, float %s, float %t, float %slice, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.b.2darray.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.b.cl
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_b_cl_cube_v4f32_f32_f32(float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %face, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.cube.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.lz
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_lz_2d_v4f32_f16(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_lz_2d_v4f32_f16(half %s, half %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32, half, half, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_o_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_cl_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.l.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_l_o_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_l_o_2d_v4f32_f32(i32 %offset, float %s, float %t, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.b.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.b.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.lz.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_lz_o_2d_v4f32_f32(i32 %offset, float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.c.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_c_o_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_c_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.c.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_c_cl_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.c.l.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_c_l_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.c.b.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_c_b_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.c.b.cl.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_c_b_cl_o_2d_v4f32_f32_f32(i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.gather4.c.lz.o
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+define amdgpu_ps float @extract_elt0_image_gather4_c_lz_o_2d_v4f32_f32(i32 %offset, float %zcompare, float %s, float %t, <8 x i32> inreg %gather4r, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %gather4r, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.getlod
+; --------------------------------------------------------------------
+
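+; This and the following groups exercise the demanded-vector-elements
+; fold: when only element 0 of the <4 x float> result is used, the call is
+; rewritten to its scalar f32 form and the dmask shrinks from 15 to 1.
+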
+; CHECK-LABEL: @extract_elt0_image_getlod_1d_v4f32_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getlod.1d.f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_getlod_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 15, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.load
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_load_2dmsaa_v4f32_i32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_load_2dmsaa_v4f32_i32(i32 %s, i32 %t, i32 %sample, <8 x i32> inreg %sampler) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %sample, <8 x i32> %sampler, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.load.mip
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_load_mip_1d_v4f32_i32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.load.mip.1d.f32.i32(i32 1, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_load_mip_1d_v4f32_i32(i32 %s, i32 %mip, <8 x i32> inreg %sampler) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.image.getresinfo
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @extract_elt0_image_getresinfo_1d_v4f32_i32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.image.getresinfo.1d.f32.i32(i32 1, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_image_getresinfo_1d_v4f32_i32(i32 %mip, <8 x i32> inreg %sampler) #0 {
+  %data = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %sampler, i32 0, i32 0)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+
+; --------------------------------------------------------------------
+; TFE / LWE
+; --------------------------------------------------------------------
+
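+; With TFE/LWE requested (trailing i32 1), the struct-returning form below
+; keeps its full dmask, presumably because the status word is still
+; demanded; the second test is a regression guard against an assert when
+; tfe is set on the plain vector form.
+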
+; CHECK-LABEL: @extract_elt0_tfe_image_load_1d_v4f32i32_i32(
+; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
+define amdgpu_ps float @extract_elt0_tfe_image_load_1d_v4f32i32_i32(i32 %s, <8 x i32> inreg %rsrc) #0 {
+  %data = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
+  %rgba = extractvalue { <4 x float>, i32 } %data, 0
+  %elt0 = extractelement <4 x float> %rgba, i32 0
+  ret float %elt0
+}
+
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32, i32, <8 x i32>, i32, i32) #1
+
+; CHECK-LABEL: @tfe_check_assert(
+; CHECK: %data = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1)
+; CHECK-NEXT: ret float %data
+define amdgpu_hs float @tfe_check_assert() #0 {
+  %data = call nsz <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 undef, i32 undef, <8 x i32> undef, i32 0, i32 1) #2
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
+!0 = !{float 2.500000e+00}

Added: llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2098 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.rcp
+; --------------------------------------------------------------------
+
+declare float @llvm.amdgcn.rcp.f32(float) nounwind readnone
+declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone
+
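+; Note: the folds below are value-exact. rcp of a constant is folded only
+; when the reciprocal is representable without rounding (1.0 -> 1.0,
+; 0.5 -> 2.0); an inexact case such as 1/43 is left as the intrinsic call.
+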
+; CHECK-LABEL: @test_constant_fold_rcp_f32_undef
+; CHECK-NEXT: ret float undef
+define float @test_constant_fold_rcp_f32_undef() nounwind {
+  %val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f32_1
+; CHECK-NEXT: ret float 1.000000e+00
+define float @test_constant_fold_rcp_f32_1() nounwind {
+  %val = call float @llvm.amdgcn.rcp.f32(float 1.0) nounwind readnone
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f64_1
+; CHECK-NEXT:  ret double 1.000000e+00
+define double @test_constant_fold_rcp_f64_1() nounwind {
+  %val = call double @llvm.amdgcn.rcp.f64(double 1.0) nounwind readnone
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f32_half
+; CHECK-NEXT: ret float 2.000000e+00
+define float @test_constant_fold_rcp_f32_half() nounwind {
+  %val = call float @llvm.amdgcn.rcp.f32(float 0.5) nounwind readnone
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f64_half
+; CHECK-NEXT:  ret double 2.000000e+00
+define double @test_constant_fold_rcp_f64_half() nounwind {
+  %val = call double @llvm.amdgcn.rcp.f64(double 0.5) nounwind readnone
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f32_43
+; CHECK-NEXT: call float @llvm.amdgcn.rcp.f32(float 4.300000e+01)
+define float @test_constant_fold_rcp_f32_43() nounwind {
+ %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) nounwind readnone
+ ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_rcp_f64_43
+; CHECK-NEXT: call double @llvm.amdgcn.rcp.f64(double 4.300000e+01)
+define double @test_constant_fold_rcp_f64_43() nounwind {
+  %val = call double @llvm.amdgcn.rcp.f64(double 4.300000e+01) nounwind readnone
+  ret double %val
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.rsq
+; --------------------------------------------------------------------
+
+declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone
+
+; CHECK-LABEL: @test_constant_fold_rsq_f32_undef
+; CHECK-NEXT: ret float undef
+define float @test_constant_fold_rsq_f32_undef() nounwind {
+  %val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone
+  ret float %val
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.frexp.mant
+; --------------------------------------------------------------------
+
+declare float @llvm.amdgcn.frexp.mant.f32(float) nounwind readnone
+declare double @llvm.amdgcn.frexp.mant.f64(double) nounwind readnone
+
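+; frexp.mant(x) returns the mantissa m with |m| in [0.5, 1.0) such that
+; x = m * 2^exp. The tests below check that zeroes keep their sign, that
+; NaN and +/-inf pass through unchanged, and that normal constants fold.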
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_undef(
+; CHECK-NEXT: ret float undef
+define float @test_constant_fold_frexp_mant_f32_undef() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float undef)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_undef(
+; CHECK-NEXT:  ret double undef
+define double @test_constant_fold_frexp_mant_f64_undef() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double undef)
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_0(
+; CHECK-NEXT: ret float 0.000000e+00
+define float @test_constant_fold_frexp_mant_f32_0() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float 0.0)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_0(
+; CHECK-NEXT:  ret double 0.000000e+00
+define double @test_constant_fold_frexp_mant_f64_0() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double 0.0)
+  ret double %val
+}
+
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_n0(
+; CHECK-NEXT: ret float -0.000000e+00
+define float @test_constant_fold_frexp_mant_f32_n0() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float -0.0)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_n0(
+; CHECK-NEXT:  ret double -0.000000e+00
+define double @test_constant_fold_frexp_mant_f64_n0() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double -0.0)
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_1(
+; CHECK-NEXT: ret float 5.000000e-01
+define float @test_constant_fold_frexp_mant_f32_1() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float 1.0)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_1(
+; CHECK-NEXT:  ret double 5.000000e-01
+define double @test_constant_fold_frexp_mant_f64_1() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double 1.0)
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_n1(
+; CHECK-NEXT: ret float -5.000000e-01
+define float @test_constant_fold_frexp_mant_f32_n1() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float -1.0)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_n1(
+; CHECK-NEXT:  ret double -5.000000e-01
+define double @test_constant_fold_frexp_mant_f64_n1() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double -1.0)
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_nan(
+; CHECK-NEXT: ret float 0x7FF8000000000000
+define float @test_constant_fold_frexp_mant_f32_nan() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x7FF8000000000000)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_nan(
+; CHECK-NEXT:  ret double 0x7FF8000000000000
+define double @test_constant_fold_frexp_mant_f64_nan() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FF8000000000000)
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_inf(
+; CHECK-NEXT: ret float 0x7FF0000000000000
+define float @test_constant_fold_frexp_mant_f32_inf() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x7FF0000000000000)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_inf(
+; CHECK-NEXT:  ret double 0x7FF0000000000000
+define double @test_constant_fold_frexp_mant_f64_inf() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FF0000000000000)
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_ninf(
+; CHECK-NEXT: ret float 0xFFF0000000000000
+define float @test_constant_fold_frexp_mant_f32_ninf() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float 0xFFF0000000000000)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_ninf(
+; CHECK-NEXT:  ret double 0xFFF0000000000000
+define double @test_constant_fold_frexp_mant_f64_ninf() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double 0xFFF0000000000000)
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_max_num(
+; CHECK-NEXT: ret float 0x3FEFFFFFE0000000
+define float @test_constant_fold_frexp_mant_f32_max_num() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x47EFFFFFE0000000)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_max_num(
+; CHECK-NEXT:  ret double 0x3FEFFFFFFFFFFFFF
+define double @test_constant_fold_frexp_mant_f64_max_num() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double 0x7FEFFFFFFFFFFFFF)
+  ret double %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f32_min_num(
+; CHECK-NEXT: ret float 5.000000e-01
+define float @test_constant_fold_frexp_mant_f32_min_num() nounwind {
+  %val = call float @llvm.amdgcn.frexp.mant.f32(float 0x36A0000000000000)
+  ret float %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_mant_f64_min_num(
+; CHECK-NEXT:  ret double 5.000000e-01
+define double @test_constant_fold_frexp_mant_f64_min_num() nounwind {
+  %val = call double @llvm.amdgcn.frexp.mant.f64(double 4.940656e-324)
+  ret double %val
+}
+
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.frexp.exp
+; --------------------------------------------------------------------
+
+declare i32 @llvm.amdgcn.frexp.exp.f32(float) nounwind readnone
+declare i32 @llvm.amdgcn.frexp.exp.f64(double) nounwind readnone
+
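+; frexp.exp(x) returns the exponent paired with frexp.mant, so
+; 1024.0 = 0.5 * 2^11 folds to 11 and 1/1024 = 0.5 * 2^-9 folds to -9;
+; zero, NaN and +/-inf all fold to 0.
+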
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_undef(
+; CHECK-NEXT: ret i32 undef
+define i32 @test_constant_fold_frexp_exp_f32_undef() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float undef)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_undef(
+; CHECK-NEXT:  ret i32 undef
+define i32 @test_constant_fold_frexp_exp_f64_undef() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double undef)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_0(
+; CHECK-NEXT: ret i32 0
+define i32 @test_constant_fold_frexp_exp_f32_0() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0.0)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_0(
+; CHECK-NEXT:  ret i32 0
+define i32 @test_constant_fold_frexp_exp_f64_0() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0.0)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_n0(
+; CHECK-NEXT: ret i32 0
+define i32 @test_constant_fold_frexp_exp_f32_n0() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float -0.0)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_n0(
+; CHECK-NEXT:  ret i32 0
+define i32 @test_constant_fold_frexp_exp_f64_n0() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double -0.0)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_1024(
+; CHECK-NEXT: ret i32 11
+define i32 @test_constant_fold_frexp_exp_f32_1024() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 1024.0)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_1024(
+; CHECK-NEXT:  ret i32 11
+define i32 @test_constant_fold_frexp_exp_f64_1024() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 1024.0)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_n1024(
+; CHECK-NEXT: ret i32 11
+define i32 @test_constant_fold_frexp_exp_f32_n1024() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float -1024.0)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_n1024(
+; CHECK-NEXT:  ret i32 11
+define i32 @test_constant_fold_frexp_exp_f64_n1024() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double -1024.0)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_1_1024(
+; CHECK-NEXT: ret i32 -9
+define i32 @test_constant_fold_frexp_exp_f32_1_1024() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0.0009765625)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_1_1024(
+; CHECK-NEXT:  ret i32 -9
+define i32 @test_constant_fold_frexp_exp_f64_1_1024() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0.0009765625)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_nan(
+; CHECK-NEXT: ret i32 0
+define i32 @test_constant_fold_frexp_exp_f32_nan() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x7FF8000000000000)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_nan(
+; CHECK-NEXT:  ret i32 0
+define i32 @test_constant_fold_frexp_exp_f64_nan() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FF8000000000000)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_inf(
+; CHECK-NEXT: ret i32 0
+define i32 @test_constant_fold_frexp_exp_f32_inf() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x7FF0000000000000)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_inf(
+; CHECK-NEXT:  ret i32 0
+define i32 @test_constant_fold_frexp_exp_f64_inf() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FF0000000000000)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_ninf(
+; CHECK-NEXT: ret i32 0
+define i32 @test_constant_fold_frexp_exp_f32_ninf() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0xFFF0000000000000)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_ninf(
+; CHECK-NEXT:  ret i32 0
+define i32 @test_constant_fold_frexp_exp_f64_ninf() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0xFFF0000000000000)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_max_num(
+; CHECK-NEXT: ret i32 128
+define i32 @test_constant_fold_frexp_exp_f32_max_num() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x47EFFFFFE0000000)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_max_num(
+; CHECK-NEXT:  ret i32 1024
+define i32 @test_constant_fold_frexp_exp_f64_max_num() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 0x7FEFFFFFFFFFFFFF)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f32_min_num(
+; CHECK-NEXT: ret i32 -148
+define i32 @test_constant_fold_frexp_exp_f32_min_num() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f32(float 0x36A0000000000000)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @test_constant_fold_frexp_exp_f64_min_num(
+; CHECK-NEXT:  ret i32 -1073
+define i32 @test_constant_fold_frexp_exp_f64_min_num() nounwind {
+  %val = call i32 @llvm.amdgcn.frexp.exp.f64(double 4.940656e-324)
+  ret i32 %val
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.class
+; --------------------------------------------------------------------
+
+declare i1 @llvm.amdgcn.class.f32(float, i32) nounwind readnone
+declare i1 @llvm.amdgcn.class.f64(double, i32) nounwind readnone
+
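+; The mask operand is the 10-bit fp class bitfield (bit 0 = sNaN,
+; bit 1 = qNaN, bit 2 = -inf, ..., bit 9 = +inf). A zero mask folds to
+; false, the full mask 1023 folds to true, and bits above bit 9 are
+; dropped (1025 -> 1). Some masks become plain fcmp: 3 (any NaN) turns
+; into fcmp uno, 96 (+/-0) into fcmp oeq 0.
+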
+; CHECK-LABEL: @test_class_undef_mask_f32(
+; CHECK: ret i1 false
+define i1 @test_class_undef_mask_f32(float %x) nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 undef)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_over_max_mask_f32(
+; CHECK: %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 1)
+define i1 @test_class_over_max_mask_f32(float %x) nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 1025)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_no_mask_f32(
+; CHECK: ret i1 false
+define i1 @test_class_no_mask_f32(float %x) nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 0)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_full_mask_f32(
+; CHECK: ret i1 true
+define i1 @test_class_full_mask_f32(float %x) nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 1023)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_undef_no_mask_f32(
+; CHECK: ret i1 false
+define i1 @test_class_undef_no_mask_f32() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 0)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_undef_full_mask_f32(
+; CHECK: ret i1 true
+define i1 @test_class_undef_full_mask_f32() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 1023)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_undef_val_f32(
+; CHECK: ret i1 undef
+define i1 @test_class_undef_val_f32() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 4)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_undef_undef_f32(
+; CHECK: ret i1 undef
+define i1 @test_class_undef_undef_f32() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_var_mask_f32(
+; CHECK: %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 %mask)
+define i1 @test_class_var_mask_f32(float %x, i32 %mask) nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 %mask)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_isnan_f32(
+; CHECK: %val = fcmp uno float %x, 0.000000e+00
+define i1 @test_class_isnan_f32(float %x) nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_is_p0_n0_f32(
+; CHECK: %val = fcmp oeq float %x, 0.000000e+00
+define i1 @test_class_is_p0_n0_f32(float %x) nounwind {
+  %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_snan_test_snan_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_snan_test_snan_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 1)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_qnan_test_qnan_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_qnan_test_qnan_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 2)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_qnan_test_snan_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_qnan_test_snan_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 1)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_ninf_test_ninf_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_ninf_test_ninf_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0xFFF0000000000000, i32 4)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_pinf_test_ninf_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_pinf_test_ninf_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000000, i32 4)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_qnan_test_ninf_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_qnan_test_ninf_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 4)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_snan_test_ninf_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_snan_test_ninf_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 4)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_nnormal_test_nnormal_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_nnormal_test_nnormal_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double -1.0, i32 8)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_pnormal_test_nnormal_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_pnormal_test_nnormal_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 8)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_nsubnormal_test_nsubnormal_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_nsubnormal_test_nsubnormal_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x800fffffffffffff, i32 16)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_psubnormal_test_nsubnormal_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_psubnormal_test_nsubnormal_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x000fffffffffffff, i32 16)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_nzero_test_nzero_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_nzero_test_nzero_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double -0.0, i32 32)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_pzero_test_nzero_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_pzero_test_nzero_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0.0, i32 32)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_pzero_test_pzero_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_pzero_test_pzero_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0.0, i32 64)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_nzero_test_pzero_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_nzero_test_pzero_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double -0.0, i32 64)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_psubnormal_test_psubnormal_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_psubnormal_test_psubnormal_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x000fffffffffffff, i32 128)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_nsubnormal_test_psubnormal_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_nsubnormal_test_psubnormal_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x800fffffffffffff, i32 128)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_pnormal_test_pnormal_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_pnormal_test_pnormal_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 1.0, i32 256)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_nnormal_test_pnormal_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_nnormal_test_pnormal_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double -1.0, i32 256)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_pinf_test_pinf_f64(
+; CHECK: ret i1 true
+define i1 @test_constant_class_pinf_test_pinf_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000000, i32 512)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_ninf_test_pinf_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_ninf_test_pinf_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0xFFF0000000000000, i32 512)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_qnan_test_pinf_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_qnan_test_pinf_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF8000000000000, i32 512)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_constant_class_snan_test_pinf_f64(
+; CHECK: ret i1 false
+define i1 @test_constant_class_snan_test_pinf_f64() nounwind {
+  %val = call i1 @llvm.amdgcn.class.f64(double 0x7FF0000000000001, i32 512)
+  ret i1 %val
+}
+
+; CHECK-LABEL: @test_class_is_snan_nnan_src(
+; CHECK-NEXT: ret i1 false
+define i1 @test_class_is_snan_nnan_src(float %x) {
+  %nnan = fadd nnan float %x, 1.0
+  %class = call i1 @llvm.amdgcn.class.f32(float %nnan, i32 1)
+  ret i1 %class
+}
+
+; CHECK-LABEL: @test_class_is_qnan_nnan_src(
+; CHECK-NEXT: ret i1 false
+define i1 @test_class_is_qnan_nnan_src(float %x) {
+  %nnan = fadd nnan float %x, 1.0
+  %class = call i1 @llvm.amdgcn.class.f32(float %nnan, i32 2)
+  ret i1 %class
+}
+
+; CHECK-LABEL: @test_class_is_nan_nnan_src(
+; CHECK-NEXT: ret i1 false
+define i1 @test_class_is_nan_nnan_src(float %x) {
+  %nnan = fadd nnan float %x, 1.0
+  %class = call i1 @llvm.amdgcn.class.f32(float %nnan, i32 3)
+  ret i1 %class
+}
+
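+; With an nnan source only the NaN bits can be dropped from the mask:
+; 267 (= 256|8|2|1) becomes 264 (= 256|8), keeping the other class bits.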
+; CHECK-LABEL: @test_class_is_nan_other_nnan_src(
+; CHECK-NEXT: %nnan = fadd nnan float %x, 1.000000e+00
+; CHECK-NEXT: %class = call i1 @llvm.amdgcn.class.f32(float %nnan, i32 264)
+define i1 @test_class_is_nan_other_nnan_src(float %x) {
+  %nnan = fadd nnan float %x, 1.0
+  %class = call i1 @llvm.amdgcn.class.f32(float %nnan, i32 267)
+  ret i1 %class
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cos
+; --------------------------------------------------------------------
+
+declare float @llvm.amdgcn.cos.f32(float) nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
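+; Cosine is an even function, so fneg and fabs on the operand are
+; stripped in the tests below.
+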
+; CHECK-LABEL: @cos_fneg_f32(
+; CHECK: %cos = call float @llvm.amdgcn.cos.f32(float %x)
+; CHECK-NEXT: ret float %cos
+define float @cos_fneg_f32(float %x) {
+  %x.fneg = fsub float -0.0, %x
+  %cos = call float @llvm.amdgcn.cos.f32(float %x.fneg)
+  ret float %cos
+}
+
+; CHECK-LABEL: @cos_fabs_f32(
+; CHECK-NEXT: %cos = call float @llvm.amdgcn.cos.f32(float %x)
+; CHECK-NEXT: ret float %cos
+define float @cos_fabs_f32(float %x) {
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  %cos = call float @llvm.amdgcn.cos.f32(float %x.fabs)
+  ret float %cos
+}
+
+; CHECK-LABEL: @cos_fabs_fneg_f32(
+; CHECK-NEXT: %cos = call float @llvm.amdgcn.cos.f32(float %x)
+; CHECK-NEXT: ret float %cos
+define float @cos_fabs_fneg_f32(float %x) {
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  %x.fabs.fneg = fsub float -0.0, %x.fabs
+  %cos = call float @llvm.amdgcn.cos.f32(float %x.fabs.fneg)
+  ret float %cos
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pkrtz
+; --------------------------------------------------------------------
+
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) nounwind readnone
+
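+; cvt.pkrtz packs two f32 values into <2 x half>, rounding toward zero.
+; Constants fold (2.0 -> 0xH4000, 4.0 -> 0xH4400), and 65535.0 becomes
+; 0xH7BFF (65504.0, the largest finite half) rather than +inf, since
+; round-toward-zero never rounds up in magnitude.
+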
+; CHECK-LABEL: @vars_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+define <2 x half> @vars_lhs_cvt_pkrtz(float %x, float %y) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %y)
+define <2 x half> @constant_lhs_cvt_pkrtz(float %y) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %y)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_rhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.000000e+00)
+define <2 x half> @constant_rhs_cvt_pkrtz(float %x) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.0)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y)
+define <2 x half> @undef_lhs_cvt_pkrtz(float %y) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef)
+define <2 x half> @undef_rhs_cvt_pkrtz(float %x) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pkrtz(
+; CHECK: ret <2 x half> undef
+define <2 x half> @undef_cvt_pkrtz() {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_splat0_cvt_pkrtz(
+; CHECK: ret <2 x half> zeroinitializer
+define <2 x half> @constant_splat0_cvt_pkrtz() {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float 0.0)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_cvt_pkrtz(
+; CHECK: ret <2 x half> <half 0xH4000, half 0xH4400>
+define <2 x half> @constant_cvt_pkrtz() {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 2.0, float 4.0)
+  ret <2 x half> %cvt
+}
+
+; Test constant values where rtz (round toward zero) changes the result.
+; CHECK-LABEL: @constant_rtz_pkrtz(
+; CHECK: ret <2 x half> <half 0xH7BFF, half 0xH7BFF>
+define <2 x half> @constant_rtz_pkrtz() {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 65535.0, float 65535.0)
+  ret <2 x half> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pknorm.i16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
+define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
+  ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
+define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
+  ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pknorm_i16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pknorm_i16() {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef)
+  ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pknorm.u16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
+define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
+  ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
+define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
+  ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pknorm_u16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pknorm_u16() {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef)
+  ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pk.i16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pk_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
+define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
+  ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pk_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
+define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
+  ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pk_i16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pk_i16() {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
+  ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pk.u16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pk_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
+define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
+  ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pk_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
+define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
+  ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pk_u16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pk_u16() {
+  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef)
+  ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.ubfe
+; --------------------------------------------------------------------
+
+declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) nounwind readnone
+declare i64 @llvm.amdgcn.ubfe.i64(i64, i32, i32) nounwind readnone
+
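+; ubfe(src, offset, width) extracts width bits of src starting at bit
+; offset, zero-extended. Offset and width are interpreted modulo the
+; operand size (133 -> 5 and 33 -> 1 for i32, while an i64 offset of 33 is
+; kept), a masked-to-zero width yields 0, and small constant offset/width
+; pairs fold to lshr + and.
+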
+; CHECK-LABEL: @ubfe_var_i32(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 %width)
+define i32 @ubfe_var_i32(i32 %src, i32 %offset, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_clear_high_bits_constant_offset_i32(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 5, i32 %width)
+define i32 @ubfe_clear_high_bits_constant_offset_i32(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 133, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_clear_high_bits_constant_width_i32(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 5)
+define i32 @ubfe_clear_high_bits_constant_width_i32(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 133)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_width_0(
+; CHECK-NEXT: ret i32 0
+define i32 @ubfe_width_0(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 0)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_width_31(
+; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 31)
+define i32 @ubfe_width_31(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 31)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_width_32(
+; CHECK-NEXT: ret i32 0
+define i32 @ubfe_width_32(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 32)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_width_33(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 1)
+define i32 @ubfe_width_33(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 33)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_33(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 1, i32 %width)
+define i32 @ubfe_offset_33(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 33, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_0(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width)
+define i32 @ubfe_offset_0(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_32(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width)
+define i32 @ubfe_offset_32(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_31(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 31, i32 %width)
+define i32 @ubfe_offset_31(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 31, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_0_width_0(
+; CHECK-NEXT: ret i32 0
+define i32 @ubfe_offset_0_width_0(i32 %src) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 0)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_0_width_3(
+; CHECK-NEXT: and i32 %src, 7
+; CHECK-NEXT: ret
+define i32 @ubfe_offset_0_width_3(i32 %src) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_3_width_1(
+; CHECK-NEXT: %1 = lshr i32 %src, 3
+; CHECK-NEXT: and i32 %1, 1
+; CHECK-NEXT: ret i32
+define i32 @ubfe_offset_3_width_1(i32 %src) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 3, i32 1)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_3_width_4(
+; CHECK-NEXT: %1 = lshr i32 %src, 3
+; CHECK-NEXT: and i32 %1, 15
+; CHECK-NEXT: ret i32
+define i32 @ubfe_offset_3_width_4(i32 %src) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 3, i32 4)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_0_0_0(
+; CHECK-NEXT: ret i32 0
+define i32 @ubfe_0_0_0() {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_neg1_5_7(
+; CHECK-NEXT: ret i32 127
+define i32 @ubfe_neg1_5_7() {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 -1, i32 5, i32 7)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_undef_src_i32(
+; CHECK-NEXT: ret i32 undef
+define i32 @ubfe_undef_src_i32(i32 %offset, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 undef, i32 %offset, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_undef_offset_i32(
+; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 undef, i32 %width)
+define i32 @ubfe_undef_offset_i32(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 undef, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_undef_width_i32(
+; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 undef)
+define i32 @ubfe_undef_width_i32(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 undef)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_33_width_4_i64(
+; CHECK-NEXT: %1 = lshr i64 %src, 33
+; CHECK-NEXT: %bfe = and i64 %1, 15
+define i64 @ubfe_offset_33_width_4_i64(i64 %src) {
+  %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 33, i32 4)
+  ret i64 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_0_i64(
+; CHECK-NEXT: %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 0, i32 %width)
+define i64 @ubfe_offset_0_i64(i64 %src, i32 %width) {
+  %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 0, i32 %width)
+  ret i64 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_32_width_32_i64(
+; CHECK-NEXT: %bfe = lshr i64 %src, 32
+; CHECK-NEXT: ret i64 %bfe
+define i64 @ubfe_offset_32_width_32_i64(i64 %src) {
+  %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 32, i32 32)
+  ret i64 %bfe
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.sbfe
+; --------------------------------------------------------------------
+
+declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) nounwind readnone
+declare i64 @llvm.amdgcn.sbfe.i64(i64, i32, i32) nounwind readnone
+
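+; sbfe is the signed counterpart: the extracted field is sign-extended,
+; so sbfe(-1, 5, 7) folds to -1 where ubfe gave 127.
+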
+; CHECK-LABEL: @sbfe_offset_31(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 31, i32 %width)
+define i32 @sbfe_offset_31(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 31, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @sbfe_neg1_5_7(
+; CHECK-NEXT: ret i32 -1
+define i32 @sbfe_neg1_5_7() {
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 -1, i32 5, i32 7)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @sbfe_offset_32_width_32_i64(
+; CHECK-NEXT: %bfe = ashr i64 %src, 32
+; CHECK-NEXT: ret i64 %bfe
+define i64 @sbfe_offset_32_width_32_i64(i64 %src) {
+  %bfe = call i64 @llvm.amdgcn.sbfe.i64(i64 %src, i32 32, i32 32)
+  ret i64 %bfe
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.exp
+; --------------------------------------------------------------------
+
+declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) nounwind inaccessiblememonly
+
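+; The second operand is the channel-enable mask; operands of disabled
+; channels are replaced with undef below (en = 1 keeps only src0,
+; en = 15 keeps all four).
+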
+; CHECK-LABEL: @exp_disabled_inputs_to_undef(
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.000000e+00, float undef, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float 2.000000e+00, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float 5.000000e-01, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float 4.000000e+00, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %x, float undef, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float %y, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float %z, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float %w, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.000000e+00, float undef, float 5.000000e-01, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.000000e+00, float undef, float undef, float 4.000000e+00, i1 false, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.000000e+00, float 2.000000e+00, float 5.000000e-01, float 4.000000e+00, i1 false, i1 false)
+define void @exp_disabled_inputs_to_undef(float %x, float %y, float %z, float %w) {
+  ; enable src0..src3 constants
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+
+  ; enable src0..src3 variables
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float %x, float %y, float %z, float %w, i1 true, i1 false)
+
+  ; enable none
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %x, float %y, float %z, float %w, i1 true, i1 false)
+
+  ; enable different source combinations
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+
+  ret void
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.exp.compr
+; --------------------------------------------------------------------
+
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) nounwind inaccessiblememonly
+
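+; For compressed exports each operand carries two packed halves, so en
+; bits 0-1 map to the first data operand and bits 2-3 to the second:
+; en = 1, 2 or 3 keeps only the xy operand, en = 12 only the zw operand.
+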
+; CHECK-LABEL: @exp_compr_disabled_inputs_to_undef(
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> %xy, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> %xy, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> %xy, <2 x half> undef, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> undef, <2 x half> %zw, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+define void @exp_compr_disabled_inputs_to_undef(<2 x half> %xy, <2 x half> %zw) {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  ret void
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.fmed3
+; --------------------------------------------------------------------
+
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) nounwind readnone
+
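+; fmed3 returns the median of its three operands. Constant operands are
+; canonicalized toward the last positions, a fully-constant call folds to
+; the middle value, and a NaN or undef operand reduces the median to
+; minnum (NaN in src0/src1) or maxnum (NaN in src2) of the other two.
+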
+; CHECK-LABEL: @fmed3_f32(
+; CHECK: %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
+define float @fmed3_f32(float %x, float %y, float %z) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_x_c0_c1_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00)
+define float @fmed3_canonicalize_x_c0_c1_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0.0, float 1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_c0_x_c1_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00)
+define float @fmed3_canonicalize_c0_x_c1_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %x, float 1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_c0_c1_x_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00)
+define float @fmed3_canonicalize_c0_c1_x_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %x)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_x_y_c_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00)
+define float @fmed3_canonicalize_x_y_c_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_x_c_y_f32(
+; CHECK: %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00)
+define float @fmed3_canonicalize_x_c_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 1.0, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_c_x_y_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00)
+define float @fmed3_canonicalize_c_x_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %x, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_undef_x_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_undef_x_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_fmf_undef_x_y_f32(
+; CHECK: call nnan float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_fmf_undef_x_y_f32(float %x, float %y) {
+  %med3 = call nnan float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_undef_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_x_undef_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float undef, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_y_undef_f32(
+; CHECK: call float @llvm.maxnum.f32(float %x, float %y)
+define float @fmed3_x_y_undef_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float undef)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_qnan0_x_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_qnan0_x_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000000000000, float %x, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_qnan0_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_x_qnan0_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8000000000000, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_y_qnan0_f32(
+; CHECK: call float @llvm.maxnum.f32(float %x, float %y)
+define float @fmed3_x_y_qnan0_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 0x7FF8000000000000)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_qnan1_x_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_qnan1_x_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float %x, float %y)
+  ret float %med3
+}
+
+; This can return any of the three qNaN operands.
+; CHECK-LABEL: @fmed3_qnan0_qnan1_qnan2_f32(
+; CHECK: ret float 0x7FF8030000000000
+define float @fmed3_qnan0_qnan1_qnan2_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float 0x7FF8002000000000, float 0x7FF8030000000000)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src0_0_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src0_0_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float -1.0, float 4.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src0_1_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src0_1_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float 4.0, float -1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src1_0_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src1_0_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 0.5, float 4.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src1_1_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src1_1_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float 0.5, float -1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src2_0_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src2_0_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 4.0, float 0.5)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src2_1_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src2_1_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float -1.0, float 0.5)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_qnan0_qnan1_f32(
+; CHECK: ret float %x
+define float @fmed3_x_qnan0_qnan1_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8001000000000, float 0x7FF8002000000000)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_qnan0_x_qnan1_f32(
+; CHECK: ret float %x
+define float @fmed3_qnan0_x_qnan1_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float %x, float 0x7FF8002000000000)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_qnan0_qnan1_x_f32(
+; CHECK: ret float %x
+define float @fmed3_qnan0_qnan1_x_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float 0x7FF8002000000000, float %x)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_nan_0_1_f32(
+; CHECK: ret float 0.0
+define float @fmed3_nan_0_1_f32() {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float 0.0, float 1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_0_nan_1_f32(
+; CHECK: ret float 0.0
+define float @fmed3_0_nan_1_f32() {
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 0x7FF8001000000000, float 1.0)
+  ret float %med
+}
+
+; CHECK-LABEL: @fmed3_0_1_nan_f32(
+; CHECK: ret float 1.0
+define float @fmed3_0_1_nan_f32() {
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8001000000000)
+  ret float %med
+}
+
+; CHECK-LABEL: @fmed3_undef_0_1_f32(
+; CHECK: ret float 0.0
+define float @fmed3_undef_0_1_f32() {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float undef, float 0.0, float 1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_0_undef_1_f32(
+; CHECK: ret float 0.0
+define float @fmed3_0_undef_1_f32() {
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float undef, float 1.0)
+  ret float %med
+}
+
+; CHECK-LABEL: @fmed3_0_1_undef_f32(
+; CHECK: ret float 1.0
+define float @fmed3_0_1_undef_f32() {
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float undef)
+  ret float %med
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.icmp
+; --------------------------------------------------------------------
+
+declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32 immarg) nounwind readnone convergent
+declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32 immarg) nounwind readnone convergent
+declare i64 @llvm.amdgcn.icmp.i1(i1, i1, i32 immarg) nounwind readnone convergent
+
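The i32 immarg in these declarations is an integer-coded comparison predicate. Assuming the CmpInst::Predicate layout from llvm/IR/InstrTypes.h (worth re-checking against this revision's headers), the codes used below decode as:

; fcmp: FCMP_FALSE=0, OEQ=1, OGT=2, OGE=3, OLT=4, OLE=5, ONE=6, ORD=7,
;       UNO=8, UEQ=9, UGT=10, UGE=11, ULT=12, ULE=13, UNE=14, FCMP_TRUE=15
; icmp: ICMP_EQ=32, NE=33, UGT=34, UGE=35, ULT=36, ULE=37,
;       SGT=38, SGE=39, SLT=40, SLE=41

Codes outside these ranges, such as 31 and 42 in @invalid_icmp_code just below, name no predicate, so the calls are left untouched.
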
+; CHECK-LABEL: @invalid_icmp_code(
+; CHECK: %under = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 31)
+; CHECK: %over = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 42)
+define i64 @invalid_icmp_code(i32 %a, i32 %b) {
+  %under = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 31)
+  %over = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 42)
+  %or = or i64 %under, %over
+  ret i64 %or
+}
+
+; CHECK-LABEL: @icmp_constant_inputs_false(
+; CHECK: ret i64 0
+define i64 @icmp_constant_inputs_false() {
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 8, i32 32)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @icmp_constant_inputs_true(
+; CHECK: %result = call i64 @llvm.read_register.i64(metadata !0) #5
+define i64 @icmp_constant_inputs_true() {
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 8, i32 34)
+  ret i64 %result
+}
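
A compare that is true for every active lane folds to a read of the exec mask, which is what the llvm.read_register call in the CHECK line stands for. A minimal standalone sketch of the folded form, assuming !0 names the register as !{!"exec"}:

declare i64 @llvm.read_register.i64(metadata)

define i64 @ballot_true_sketch() {
  %exec = call i64 @llvm.read_register.i64(metadata !0)   ; mask of active lanes
  ret i64 %exec
}

!0 = !{!"exec"}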
+
+; CHECK-LABEL: @icmp_constant_to_rhs_slt(
+; CHECK: %result = call i64 @llvm.amdgcn.icmp.i32(i32 %x, i32 9, i32 38)
+define i64 @icmp_constant_to_rhs_slt(i32 %x) {
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 %x, i32 40)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+define i64 @fold_icmp_ne_0_zext_icmp_eq_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ne_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 33)
+define i64 @fold_icmp_ne_0_zext_icmp_ne_i32(i32 %a, i32 %b) {
+  %cmp = icmp ne i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_sle_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 41)
+define i64 @fold_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) {
+  %cmp = icmp sle i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ugt_i64(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 34)
+define i64 @fold_icmp_ne_0_zext_icmp_ugt_i64(i64 %a, i64 %b) {
+  %cmp = icmp ugt i64 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_swap_i64(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 34)
+define i64 @fold_icmp_ne_0_zext_icmp_ult_swap_i64(i64 %a, i64 %b) {
+  %cmp = icmp ugt i64 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 0, i32 %zext.cmp, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 1)
+define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f32(float %a, float %b) {
+  %cmp = fcmp oeq float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_une_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 14)
+define i64 @fold_icmp_ne_0_zext_fcmp_une_f32(float %a, float %b) {
+  %cmp = fcmp une float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_olt_f64(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f64(double %a, double %b, i32 4)
+define i64 @fold_icmp_ne_0_zext_fcmp_olt_f64(double %a, double %b) {
+  %cmp = fcmp olt double %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_icmp_ne_0_i32(
+; CHECK: %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+define i64 @fold_icmp_sext_icmp_ne_0_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %sext.cmp = sext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_eq_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 33)
+define i64 @fold_icmp_eq_0_zext_icmp_eq_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_slt_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 39)
+define i64 @fold_icmp_eq_0_zext_icmp_slt_i32(i32 %a, i32 %b) {
+  %cmp = icmp slt i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_oeq_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 14)
+define i64 @fold_icmp_eq_0_zext_fcmp_oeq_f32(float %a, float %b) {
+  %cmp = fcmp oeq float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ule_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 2)
+define i64 @fold_icmp_eq_0_zext_fcmp_ule_f32(float %a, float %b) {
+  %cmp = fcmp ule float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ogt_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 13)
+define i64 @fold_icmp_eq_0_zext_fcmp_ogt_f32(float %a, float %b) {
+  %cmp = fcmp ogt float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_zext_icmp_eq_1_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+define i64 @fold_icmp_zext_icmp_eq_1_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_zext_argi1_eq_1_i32(
+; CHECK: %zext.cond = zext i1 %cond to i32
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 0, i32 33)
+define i64 @fold_icmp_zext_argi1_eq_1_i32(i1 %cond) {
+  %zext.cond = zext i1 %cond to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_zext_argi1_eq_neg1_i32(
+; CHECK: %zext.cond = zext i1 %cond to i32
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 -1, i32 32)
+define i64 @fold_icmp_zext_argi1_eq_neg1_i32(i1 %cond) {
+  %zext.cond = zext i1 %cond to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 -1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_argi1_eq_1_i32(
+; CHECK: %sext.cond = sext i1 %cond to i32
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 1, i32 32)
+define i64 @fold_icmp_sext_argi1_eq_1_i32(i1 %cond) {
+  %sext.cond = sext i1 %cond to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i32(
+; CHECK: %sext.cond = sext i1 %cond to i32
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 0, i32 33)
+define i64 @fold_icmp_sext_argi1_eq_neg1_i32(i1 %cond) {
+  %sext.cond = sext i1 %cond to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 -1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i64(
+; CHECK: %sext.cond = sext i1 %cond to i64
+; CHECK: call i64 @llvm.amdgcn.icmp.i64(i64 %sext.cond, i64 0, i32 33)
+define i64 @fold_icmp_sext_argi1_eq_neg1_i64(i1 %cond) {
+  %sext.cond = sext i1 %cond to i64
+  %mask = call i64 @llvm.amdgcn.icmp.i64(i64 %sext.cond, i64 -1, i32 32)
+  ret i64 %mask
+}
+
+; TODO: Should be able to fold to false
+; CHECK-LABEL: @fold_icmp_sext_icmp_eq_1_i32(
+; CHECK: %cmp = icmp eq i32 %a, %b
+; CHECK: %sext.cmp = sext i1 %cmp to i32
+; CHECK: %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 1, i32 32)
+define i64 @fold_icmp_sext_icmp_eq_1_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %sext.cmp = sext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 1, i32 32)
+  ret i64 %mask
+}
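
The TODO holds because sext i1 yields only 0 or -1; %sext.cmp can never equal 1, so the lane predicate is statically false. A sketch of the fully folded form the missing combine would produce (hypothetical, not what instcombine emits here):

define i64 @fold_icmp_sext_icmp_eq_1_i32_sketch(i32 %a, i32 %b) {
  ; sext i1 is 0 or -1, never 1, so the ballot mask is all zeros
  ret i64 0
}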
+
+; CHECK-LABEL: @fold_icmp_sext_icmp_eq_neg1_i32(
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+define i64 @fold_icmp_sext_icmp_eq_neg1_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %sext.cmp = sext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 -1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_icmp_sge_neg1_i32(
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 39)
+define i64 @fold_icmp_sext_icmp_sge_neg1_i32(i32 %a, i32 %b) {
+  %cmp = icmp sge i32 %a, %b
+  %sext.cmp = sext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 -1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_not_icmp_ne_0_zext_icmp_sle_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 38)
+define i64 @fold_not_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) {
+  %cmp = icmp sle i32 %a, %b
+  %not = xor i1 %cmp, true
+  %zext.cmp = zext i1 %not to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i4(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i4 [[A:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i4 [[B:%.*]] to i16
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 [[TMP1]], i16 [[TMP2]], i32 32)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_eq_i4(i4 %a, i4 %b) {
+  %cmp = icmp eq i4 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 [[TMP1]], i16 [[TMP2]], i32 32)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_eq_i8(i8 %a, i8 %b) {
+  %cmp = icmp eq i8 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i16(
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 %a, i16 %b, i32 32)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_eq_i16(i16 %a, i16 %b) {
+  %cmp = icmp eq i16 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i36(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i36 [[A:%.*]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i36 [[B:%.*]] to i64
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i64(i64 [[TMP1]], i64 [[TMP2]], i32 32)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_eq_i36(i36 %a, i36 %b) {
+  %cmp = icmp eq i36 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i128(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i128 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ZEXT_CMP:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i32(i32 [[ZEXT_CMP]], i32 0, i32 33)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_eq_i128(i128 %a, i128 %b) {
+  %cmp = icmp eq i128 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f16(
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.fcmp.f16(half [[A:%.*]], half [[B:%.*]], i32 1)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f16(half %a, half %b) {
+  %cmp = fcmp oeq half %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f128(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq fp128 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ZEXT_CMP:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i32(i32 [[ZEXT_CMP]], i32 0, i32 33)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f128(fp128 %a, fp128 %b) {
+;
+  %cmp = fcmp oeq fp128 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i4(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i4 [[A:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i4 [[B:%.*]] to i16
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 [[TMP1]], i16 [[TMP2]], i32 40)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_slt_i4(i4 %a, i4 %b) {
+  %cmp = icmp slt i4 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[A:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[B:%.*]] to i16
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 [[TMP1]], i16 [[TMP2]], i32 40)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_slt_i8(i8 %a, i8 %b) {
+  %cmp = icmp slt i8 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_slt_i16(
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 %a, i16 %b, i32 40)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_slt_i16(i16 %a, i16 %b) {
+  %cmp = icmp slt i16 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i4(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i4 [[A:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i4 [[B:%.*]] to i16
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 [[TMP1]], i16 [[TMP2]], i32 36)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_ult_i4(i4 %a, i4 %b) {
+  %cmp = icmp ult i4 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 [[TMP1]], i16 [[TMP2]], i32 36)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_ult_i8(i8 %a, i8 %b) {
+  %cmp = icmp ult i8 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_i16(
+; CHECK-NEXT:    [[MASK:%.*]] = call i64 @llvm.amdgcn.icmp.i16(i16 %a, i16 %b, i32 36)
+; CHECK-NEXT:    ret i64 [[MASK]]
+define i64 @fold_icmp_ne_0_zext_icmp_ult_i16(i16 %a, i16 %b) {
+  %cmp = icmp ult i16 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; 1-bit NE comparisons
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i1(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i1(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ne_i1(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ne_i1(i32 %a, i32 %b) {
+  %cmp = icmp ne i32 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_sle_i1(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_sle_i1(i32 %a, i32 %b) {
+  %cmp = icmp sle i32 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ugt_i64(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ugt_i64(i64 %a, i64 %b) {
+  %cmp = icmp ugt i64 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_swap_i64(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ult_swap_i64(i64 %a, i64 %b) {
+  %cmp = icmp ugt i64 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 false, i1 %cmp, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f32(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f32(float %a, float %b) {
+  %cmp = fcmp oeq float %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_une_f32(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_une_f32(float %a, float %b) {
+  %cmp = fcmp une float %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_olt_f64(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_olt_f64(double %a, double %b) {
+  %cmp = fcmp olt double %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i4(
+; CHECK-NEXT: icmp
+; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i4(i4 %a, i4 %b) {
+  %cmp = icmp eq i4 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i8(
+; CHECK-NEXT: icmp
+; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i8(i8 %a, i8 %b) {
+  %cmp = icmp eq i8 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i16(
+; CHECK-NEXT: icmp
+; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i16(i16 %a, i16 %b) {
+  %cmp = icmp eq i16 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i36(
+; CHECK-NEXT: icmp
+; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i36(i36 %a, i36 %b) {
+  %cmp = icmp eq i36 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i128(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i128(i128 %a, i128 %b) {
+  %cmp = icmp eq i128 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f16(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f16(half %a, half %b) {
+  %cmp = fcmp oeq half %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f128(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f128(fp128 %a, fp128 %b) {
+;
+  %cmp = fcmp oeq fp128 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i4(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_slt_i4(i4 %a, i4 %b) {
+  %cmp = icmp slt i4 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i8(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_slt_i8(i8 %a, i8 %b) {
+  %cmp = icmp slt i8 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i16(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_slt_i16(i16 %a, i16 %b) {
+  %cmp = icmp slt i16 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i4(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ult_i4(i4 %a, i4 %b) {
+  %cmp = icmp ult i4 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i8(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ult_i8(i8 %a, i8 %b) {
+  %cmp = icmp ult i8 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i16(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ult_i16(i16 %a, i16 %b) {
+  %cmp = icmp ult i16 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.fcmp
+; --------------------------------------------------------------------
+
+declare i64 @llvm.amdgcn.fcmp.f32(float, float, i32 immarg) nounwind readnone convergent
+
+; CHECK-LABEL: @invalid_fcmp_code(
+; CHECK: %under = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 -1)
+; CHECK: %over = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 16)
+define i64 @invalid_fcmp_code(float %a, float %b) {
+  %under = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 -1)
+  %over = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 16)
+  %or = or i64 %under, %over
+  ret i64 %or
+}
+
+; CHECK-LABEL: @fcmp_constant_inputs_false(
+; CHECK: ret i64 0
+define i64 @fcmp_constant_inputs_false() {
+  %result = call i64 @llvm.amdgcn.fcmp.f32(float 2.0, float 4.0, i32 1)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @fcmp_constant_inputs_true(
+; CHECK: %result = call i64 @llvm.read_register.i64(metadata !0) #5
+define i64 @fcmp_constant_inputs_true() {
+  %result = call i64 @llvm.amdgcn.fcmp.f32(float 2.0, float 4.0, i32 4)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @fcmp_constant_to_rhs_olt(
+; CHECK: %result = call i64 @llvm.amdgcn.fcmp.f32(float %x, float 4.000000e+00, i32 2)
+define i64 @fcmp_constant_to_rhs_olt(float %x) {
+  %result = call i64 @llvm.amdgcn.fcmp.f32(float 4.0, float %x, i32 4)
+  ret i64 %result
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.wqm.vote
+; --------------------------------------------------------------------
+
+declare i1 @llvm.amdgcn.wqm.vote(i1)
+
+; CHECK-LABEL: @wqm_vote_true(
+; CHECK: ret float 1.000000e+00
+define float @wqm_vote_true() {
+main_body:
+  %w = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %r = select i1 %w, float 1.0, float 0.0
+  ret float %r
+}
+
+; CHECK-LABEL: @wqm_vote_false(
+; CHECK: ret float 0.000000e+00
+define float @wqm_vote_false() {
+main_body:
+  %w = call i1 @llvm.amdgcn.wqm.vote(i1 false)
+  %r = select i1 %w, float 1.0, float 0.0
+  ret float %r
+}
+
+; CHECK-LABEL: @wqm_vote_undef(
+; CHECK: ret float 0.000000e+00
+define float @wqm_vote_undef() {
+main_body:
+  %w = call i1 @llvm.amdgcn.wqm.vote(i1 undef)
+  %r = select i1 %w, float 1.0, float 0.0
+  ret float %r
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.kill
+; --------------------------------------------------------------------
+
+declare void @llvm.amdgcn.kill(i1)
+
+; CHECK-LABEL: @kill_true() {
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @kill_true() {
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.update.dpp.i32
+; --------------------------------------------------------------------
+
+declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
+
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @update_dpp_no_combine(
+; CHECK: @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
+define amdgpu_kernel void @update_dpp_no_combine(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
+  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0)
+  store i32 %tmp0, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @update_dpp_drop_old(
+; CHECK: @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in2, i32 3, i32 15, i32 15, i1 true)
+define amdgpu_kernel void @update_dpp_drop_old(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
+  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 3, i32 15, i32 15, i1 1)
+  store i32 %tmp0, i32 addrspace(1)* %out
+  ret void
+}
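
The drop-old fold rests on an assumption about DPP semantics: when bound_ctrl (the final i1 operand) is set, lanes with no valid source read 0 rather than the old value, so the old-value operand is dead. In before/after form:

; before: call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 3, i32 15, i32 15, i1 true)
; after:  call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in2, i32 3, i32 15, i32 15, i1 true)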
+
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @update_dpp_undef_old(
+; CHECK: @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in1, i32 4, i32 15, i32 15, i1 true)
+define amdgpu_kernel void @update_dpp_undef_old(i32 addrspace(1)* %out, i32 %in1) {
+  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 %in1, i32 4, i32 15, i32 15, i1 1)
+  store i32 %tmp0, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: attributes #5 = { convergent }

Added: llvm/trunk/test/Transforms/InstCombine/AMDGPU/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AMDGPU/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AMDGPU/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/InstCombine/AMDGPU/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'AMDGPU' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,65 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> zeroinitializer
+}
+
+define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @constantMul() nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+  ret <4 x i32> %a
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+}
+
+define <4 x i32> @constantMulS() nounwind readnone ssp {
+entry:
+  %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+}
+
+define <4 x i32> @constantMulU() nounwind readnone ssp {
+entry:
+  %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+}
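
The signed/unsigned pair above differs only in how each lane is widened before the multiply. A scalar model of one lane (a sketch, not part of the test):

define i32 @vmulls_lane_model(i16 %a, i16 %b) {
  %as = sext i16 %a to i32   ; vmulls widens with sext: i16 -1 becomes i32 -1
  %bs = sext i16 %b to i32
  %m = mul i32 %as, %bs      ; so -1 * 1 constant-folds to -1
  ret i32 %m
}

define i32 @vmullu_lane_model(i16 %a, i16 %b) {
  %az = zext i16 %a to i32   ; vmullu widens with zext: i16 -1 becomes i32 65535
  %bz = zext i16 %b to i32
  %m = mul i32 %az, %bz      ; so -1 * 1 constant-folds to 65535
  ret i32 %m
}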
+
+define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
+  %b = add <4 x i32> zeroinitializer, %a
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
+; CHECK-NEXT: ret <4 x i32> %a
+}
+
+define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp {
+entry:
+  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+  %b = add <4 x i32> %x, %a
+  ret <4 x i32> %b
+; CHECK: entry:
+; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: ret <4 x i32> %b
+}
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone

Added: llvm/trunk/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/aes-intrinsics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/aes-intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/aes-intrinsics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,43 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+; ARM AES intrinsic variants
+
+define <16 x i8> @combineXorAeseZeroARM(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: @combineXorAeseZeroARM(
+; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data, <16 x i8> %key)
+; CHECK-NEXT:    ret <16 x i8> %data.aes
+  %data.xor = xor <16 x i8> %data, %key
+  %data.aes = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data.xor, <16 x i8> zeroinitializer)
+  ret <16 x i8> %data.aes
+}
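
The fold works because AESE begins with AddRoundKey, i.e. it operates on (data xor key), giving the identity

;   aese(data ^ key, 0) == aese(data, key)

Only the zero-key form is combined; with a nonzero key operand, as in the NonZero tests below, the explicit xor is left in place.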
+
+define <16 x i8> @combineXorAeseNonZeroARM(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: @combineXorAeseNonZeroARM(
+; CHECK-NEXT:    %data.xor = xor <16 x i8> %data, %key
+; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    ret <16 x i8> %data.aes
+  %data.xor = xor <16 x i8> %data, %key
+  %data.aes = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  ret <16 x i8> %data.aes
+}
+
+define <16 x i8> @combineXorAesdZeroARM(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: @combineXorAesdZeroARM(
+; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data, <16 x i8> %key)
+; CHECK-NEXT:    ret <16 x i8> %data.aes
+  %data.xor = xor <16 x i8> %data, %key
+  %data.aes = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data.xor, <16 x i8> zeroinitializer)
+  ret <16 x i8> %data.aes
+}
+
+define <16 x i8> @combineXorAesdNonZeroARM(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: @combineXorAesdNonZeroARM(
+; CHECK-NEXT:    %data.xor = xor <16 x i8> %data, %key
+; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    ret <16 x i8> %data.aes
+  %data.xor = xor <16 x i8> %data, %key
+  %data.aes = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+  ret <16 x i8> %data.aes
+}
+
+declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>) #0
+declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>) #0

Added: llvm/trunk/test/Transforms/InstCombine/ARM/constant-fold-hang.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/constant-fold-hang.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/constant-fold-hang.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/constant-fold-hang.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; RUN: opt -instcombine < %s
+
+; Function Attrs: nounwind readnone ssp
+define void @mulByZero(<4 x i16> %x) #0 {
+entry:
+  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) #2
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) #1
+
+attributes #0 = { nounwind readnone ssp }
+attributes #1 = { nounwind readnone }

Added: llvm/trunk/test/Transforms/InstCombine/ARM/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'ARM' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/InstCombine/ARM/neon-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/neon-intrinsics.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/neon-intrinsics.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/neon-intrinsics.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,25 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; The alignment arguments for NEON load/store intrinsics can be increased
+; by instcombine.  Check for this.
+
+; CHECK: vld4.v2i32.p0i8({{.*}}, i32 32)
+; CHECK: vst4.p0i8.v2i32({{.*}}, i32 16)
+
+ at x = common global [8 x i32] zeroinitializer, align 32
+ at y = common global [8 x i32] zeroinitializer, align 16
+
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+
+define void @test() nounwind ssp {
+  %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8* bitcast ([8 x i32]* @x to i8*), i32 1)
+  %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
+  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 1
+  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2
+  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 3
+  call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* bitcast ([8 x i32]* @y to i8*), <2 x i32> %tmp2, <2 x i32> %tmp3, <2 x i32> %tmp4, <2 x i32> %tmp5, i32 1)
+  ret void
+}
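
Here @x is known 32-byte aligned and @y 16-byte aligned, so the pessimistic i32 1 alignment immediates can be raised to the proven alignment of each underlying object. The rewritten calls matched by the CHECK lines look like:

;   call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8* bitcast ([8 x i32]* @x to i8*), i32 32)
;   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* bitcast ([8 x i32]* @y to i8*), ..., i32 16)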
+
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst4.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind

Added: llvm/trunk/test/Transforms/InstCombine/ARM/strcmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/strcmp.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/strcmp.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/strcmp.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,153 @@
+; Test that the strcmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+ at hello = constant [6 x i8] c"hello\00"
+ at hell = constant [5 x i8] c"hell\00"
+ at bell = constant [5 x i8] c"bell\00"
+ at null = constant [1 x i8] zeroinitializer
+
+declare i32 @strcmp(i8*, i8*)
+
+; strcmp("", x) -> -*x
+define arm_aapcscc i32 @test1(i8* %str2) {
+; CHECK-LABEL: @test1(
+; CHECK: %strcmpload = load i8, i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: %2 = sub nsw i32 0, %1
+; CHECK: ret i32 %2
+
+  %str1 = getelementptr inbounds [1 x i8], [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call arm_aapcscc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+
+}
+
+; strcmp(x, "") -> *x
+define arm_aapcscc i32 @test2(i8* %str1) {
+; CHECK-LABEL: @test2(
+; CHECK: %strcmpload = load i8, i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: ret i32 %1
+
+  %str2 = getelementptr inbounds [1 x i8], [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call arm_aapcscc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+; strcmp(x, y) -> constant
+define arm_aapcscc i32 @test3() {
+; CHECK-LABEL: @test3(
+; CHECK: ret i32 -1
+
+  %str1 = getelementptr inbounds [5 x i8], [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [6 x i8], [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = call arm_aapcscc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+define arm_aapcscc i32 @test4() {
+; CHECK-LABEL: @test4(
+; CHECK: ret i32 1
+
+  %str1 = getelementptr inbounds [5 x i8], [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [1 x i8], [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call arm_aapcscc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+; strcmp(x, y)   -> memcmp(x, y, <known length>)
+; (This transform is rather difficult to trigger in a useful manner)
+define arm_aapcscc i32 @test5(i1 %b) {
+; CHECK-LABEL: @test5(
+; CHECK: %memcmp = call i32 @memcmp(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @hello, i32 0, i32 0), i8* %str2, i32 5)
+; CHECK: ret i32 %memcmp
+
+  %str1 = getelementptr inbounds [6 x i8], [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = getelementptr inbounds [5 x i8], [5 x i8]* @hell, i32 0, i32 0
+  %temp2 = getelementptr inbounds [5 x i8], [5 x i8]* @bell, i32 0, i32 0
+  %str2 = select i1 %b, i8* %temp1, i8* %temp2
+  %temp3 = call arm_aapcscc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp3
+}
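
The memcmp length follows from the known string lengths: both select arms ("hell" and "bell") have length 4 and "hello" has length 5, so comparing min(4, 5) + 1 = 5 bytes covers the shorter string's terminating NUL and decides the result. (The min+1 rule is an inference from this test, not something the file states.)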
+
+; strcmp(x,x)  -> 0
+define arm_aapcscc i32 @test6(i8* %str) {
+; CHECK-LABEL: @test6(
+; CHECK: ret i32 0
+
+  %temp1 = call arm_aapcscc i32 @strcmp(i8* %str, i8* %str)
+  ret i32 %temp1
+}
+
+; strcmp("", x) -> -*x
+define arm_aapcs_vfpcc i32 @test1_vfp(i8* %str2) {
+; CHECK-LABEL: @test1_vfp(
+; CHECK: %strcmpload = load i8, i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: %2 = sub nsw i32 0, %1
+; CHECK: ret i32 %2
+
+  %str1 = getelementptr inbounds [1 x i8], [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call arm_aapcs_vfpcc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+
+}
+
+; strcmp(x, "") -> *x
+define arm_aapcs_vfpcc i32 @test2_vfp(i8* %str1) {
+; CHECK-LABEL: @test2_vfp(
+; CHECK: %strcmpload = load i8, i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: ret i32 %1
+
+  %str2 = getelementptr inbounds [1 x i8], [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call arm_aapcs_vfpcc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+; strcmp(x, y) -> constant
+define arm_aapcs_vfpcc i32 @test3_vfp() {
+; CHECK-LABEL: @test3_vfp(
+; CHECK: ret i32 -1
+
+  %str1 = getelementptr inbounds [5 x i8], [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [6 x i8], [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = call arm_aapcs_vfpcc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+define arm_aapcs_vfpcc i32 @test4_vfp() {
+; CHECK-LABEL: @test4_vfp(
+; CHECK: ret i32 1
+
+  %str1 = getelementptr inbounds [5 x i8], [5 x i8]* @hell, i32 0, i32 0
+  %str2 = getelementptr inbounds [1 x i8], [1 x i8]* @null, i32 0, i32 0
+  %temp1 = call arm_aapcs_vfpcc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp1
+}
+
+; strcmp(x, y)   -> memcmp(x, y, <known length>)
+; (This transform is rather difficult to trigger in a useful manner)
+define arm_aapcs_vfpcc i32 @test5_vfp(i1 %b) {
+; CHECK-LABEL: @test5_vfp(
+; CHECK: %memcmp = call i32 @memcmp(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @hello, i32 0, i32 0), i8* %str2, i32 5)
+; CHECK: ret i32 %memcmp
+
+  %str1 = getelementptr inbounds [6 x i8], [6 x i8]* @hello, i32 0, i32 0
+  %temp1 = getelementptr inbounds [5 x i8], [5 x i8]* @hell, i32 0, i32 0
+  %temp2 = getelementptr inbounds [5 x i8], [5 x i8]* @bell, i32 0, i32 0
+  %str2 = select i1 %b, i8* %temp1, i8* %temp2
+  %temp3 = call arm_aapcs_vfpcc i32 @strcmp(i8* %str1, i8* %str2)
+  ret i32 %temp3
+}
+
+; strcmp(x,x)  -> 0
+define arm_aapcs_vfpcc i32 @test6_vfp(i8* %str) {
+; CHECK-LABEL: @test6_vfp(
+; CHECK: ret i32 0
+
+  %temp1 = call arm_aapcs_vfpcc i32 @strcmp(i8* %str, i8* %str)
+  ret i32 %temp1
+}

Added: llvm/trunk/test/Transforms/InstCombine/ARM/strcpy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/strcpy.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/strcpy.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/strcpy.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,76 @@
+; Test that the strcpy library call simplifier works correctly for calls using ARM calling conventions
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+ at hello = constant [6 x i8] c"hello\00"
+ at a = common global [32 x i8] zeroinitializer, align 1
+ at b = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strcpy(i8*, i8*)
+
+define arm_aapcscc void @test_simplify1() {
+; CHECK-LABEL: @test_simplify1(
+
+  %dst = getelementptr [32 x i8], [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8], [6 x i8]* @hello, i32 0, i32 0
+
+  call arm_aapcscc i8* @strcpy(i8* %dst, i8* %src)
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32
+  ret void
+}
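
Since the source is the 6-byte constant "hello\00", the strcpy is expected to become a memcpy of all 6 bytes, with the length in i32, the pointer-sized integer on this target. A sketch of the rewritten call, assuming the four-operand memcpy signature in use at this revision:

;   call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %dst, i8* align 1 <src>, i32 6, i1 false)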
+
+define arm_aapcscc i8* @test_simplify2() {
+; CHECK-LABEL: @test_simplify2(
+
+  %dst = getelementptr [32 x i8], [32 x i8]* @a, i32 0, i32 0
+
+  %ret = call arm_aapcscc i8* @strcpy(i8* %dst, i8* %dst)
+; CHECK: ret i8* getelementptr inbounds ([32 x i8], [32 x i8]* @a, i32 0, i32 0)
+  ret i8* %ret
+}
+
+define arm_aapcscc i8* @test_no_simplify1() {
+; CHECK-LABEL: @test_no_simplify1(
+
+  %dst = getelementptr [32 x i8], [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [32 x i8], [32 x i8]* @b, i32 0, i32 0
+
+  %ret = call arm_aapcscc i8* @strcpy(i8* %dst, i8* %src)
+; CHECK: call arm_aapcscc i8* @strcpy
+  ret i8* %ret
+}
+
+define arm_aapcs_vfpcc void @test_simplify1_vfp() {
+; CHECK-LABEL: @test_simplify1_vfp(
+
+  %dst = getelementptr [32 x i8], [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [6 x i8], [6 x i8]* @hello, i32 0, i32 0
+
+  call arm_aapcs_vfpcc i8* @strcpy(i8* %dst, i8* %src)
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32
+  ret void
+}
+
+define arm_aapcs_vfpcc i8* @test_simplify2_vfp() {
+; CHECK-LABEL: @test_simplify2_vfp(
+
+  %dst = getelementptr [32 x i8], [32 x i8]* @a, i32 0, i32 0
+
+  %ret = call arm_aapcs_vfpcc i8* @strcpy(i8* %dst, i8* %dst)
+; CHECK: ret i8* getelementptr inbounds ([32 x i8], [32 x i8]* @a, i32 0, i32 0)
+  ret i8* %ret
+}
+
+define arm_aapcs_vfpcc i8* @test_no_simplify1_vfp() {
+; CHECK-LABEL: @test_no_simplify1_vfp(
+
+  %dst = getelementptr [32 x i8], [32 x i8]* @a, i32 0, i32 0
+  %src = getelementptr [32 x i8], [32 x i8]* @b, i32 0, i32 0
+
+  %ret = call arm_aapcs_vfpcc i8* @strcpy(i8* %dst, i8* %src)
+; CHECK: call arm_aapcs_vfpcc i8* @strcpy
+  ret i8* %ret
+}

Added: llvm/trunk/test/Transforms/InstCombine/ARM/tbl1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/tbl1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/tbl1.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/tbl1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-arm-none-eabi"
+
+; Turning a table lookup intrinsic into a shufflevector instruction
+; can be beneficial. If the mask used for the lookup is the constant
+; vector {7,6,5,4,3,2,1,0}, the back-end lowers the shuffle to a
+; rev64 instruction instead of a table lookup.
+
+define <8 x i8> @tbl1_8x8(<8 x i8> %vec) {
+; CHECK-LABEL: @tbl1_8x8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[TMP0]]
+;
+entry:
+  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %vtbl1
+}
+
+; Bail out of the optimization if a mask index is out of range.
+define <8 x i8> @tbl1_8x8_out_of_range(<8 x i8> %vec) {
+; CHECK-LABEL: @tbl1_8x8_out_of_range(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VTBL1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[VEC:%.*]], <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <8 x i8> [[VTBL1]]
+;
+entry:
+  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %vtbl1
+}
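
The bail-out is needed because vtbl1 yields 0 for any index past the end of the table, while every shufflevector lane must name a real source element; index 8 in an 8-byte table has no shuffle equivalent:

;   vtbl1(table, i) = table[i] if i < 8, else 0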
+
+declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>)

Added: llvm/trunk/test/Transforms/InstCombine/ARM/vld1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ARM/vld1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ARM/vld1.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ARM/vld1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-arm-none-eabi"
+
+; Turning a vld1 intrinsic into an llvm load is beneficial
+; when the underlying object being addressed comes from a
+; constant, since we get constant-folding for free.
+
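As a concrete illustration of that benefit (a hypothetical example, not one of the tests below), once the intrinsic becomes a plain load, a load from a constant global folds away entirely:

@cst = constant <2 x i32> <i32 1, i32 2>

define <2 x i32> @load_from_constant_sketch() {
  ; after vld1 -> load, this simplifies to ret <2 x i32> <i32 1, i32 2>
  %v = load <2 x i32>, <2 x i32>* @cst, align 8
  ret <2 x i32> %v
}
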
+; Bail out of the optimization if the alignment is not a constant.
+define <2 x i64> @vld1_align(i8* %ptr, i32 %align) {
+; CHECK-LABEL: @vld1_align(
+; CHECK-NEXT:    [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[PTR:%.*]], i32 [[ALIGN:%.*]])
+; CHECK-NEXT:    ret <2 x i64> [[VLD1]]
+;
+  %vld1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %ptr, i32 %align)
+  ret <2 x i64> %vld1
+}
+
+; Bail out of the optimization if the alignment is not a power of two.
+define <2 x i64> @vld1_align_pow2(i8* %ptr) {
+; CHECK-LABEL: @vld1_align_pow2(
+; CHECK-NEXT:    [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[PTR:%.*]], i32 3)
+; CHECK-NEXT:    ret <2 x i64> [[VLD1]]
+;
+  %vld1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %ptr, i32 3)
+  ret <2 x i64> %vld1
+}
+
+define <8 x i8> @vld1_8x8(i8* %ptr) {
+; CHECK-LABEL: @vld1_8x8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[PTR:%.*]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    ret <8 x i8> [[TMP2]]
+;
+  %vld1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %ptr, i32 1)
+  ret <8 x i8> %vld1
+}
+
+define <4 x i16> @vld1_4x16(i8* %ptr) {
+; CHECK-LABEL: @vld1_4x16(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[PTR:%.*]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    ret <4 x i16> [[TMP2]]
+;
+  %vld1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* %ptr, i32 2)
+  ret <4 x i16> %vld1
+}
+
+define <2 x i32> @vld1_2x32(i8* %ptr) {
+; CHECK-LABEL: @vld1_2x32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[PTR:%.*]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    ret <2 x i32> [[TMP2]]
+;
+  %vld1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* %ptr, i32 4)
+  ret <2 x i32> %vld1
+}
+
+define <1 x i64> @vld1_1x64(i8* %ptr) {
+; CHECK-LABEL: @vld1_1x64(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[PTR:%.*]] to <1 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    ret <1 x i64> [[TMP2]]
+;
+  %vld1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* %ptr, i32 8)
+  ret <1 x i64> %vld1
+}
+
+define <8 x i16> @vld1_8x16(i8* %ptr) {
+; CHECK-LABEL: @vld1_8x16(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[PTR:%.*]] to <8 x i16>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %vld1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %ptr, i32 2)
+  ret <8 x i16> %vld1
+}
+
+define <16 x i8> @vld1_16x8(i8* %ptr) {
+; CHECK-LABEL: @vld1_16x8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[PTR:%.*]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+;
+  %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %ptr, i32 1)
+  ret <16 x i8> %vld1
+}
+
+define <4 x i32> @vld1_4x32(i8* %ptr) {
+; CHECK-LABEL: @vld1_4x32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[PTR:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %vld1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* %ptr, i32 4)
+  ret <4 x i32> %vld1
+}
+
+define <2 x i64> @vld1_2x64(i8* %ptr) {
+; CHECK-LABEL: @vld1_2x64(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[PTR:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %vld1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %ptr, i32 8)
+  ret <2 x i64> %vld1
+}
+
+declare <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8*, i32)
+declare <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8*, i32)
+declare <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8*, i32)
+declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8*, i32)
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32)
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8*, i32)
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8*, i32)
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32)

Added: llvm/trunk/test/Transforms/InstCombine/AddOverFlow.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/AddOverFlow.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/AddOverFlow.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/AddOverFlow.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,266 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; %a is negative, %b is positive
+define i16 @oppositesign(i16 %x, i16 %y) {
+; CHECK-LABEL: @oppositesign(
+; CHECK-NEXT:    [[A:%.*]] = or i16 [[X:%.*]], -32768
+; CHECK-NEXT:    [[B:%.*]] = and i16 [[Y:%.*]], 32767
+; CHECK-NEXT:    [[C:%.*]] = add nsw i16 [[A]], [[B]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = or i16 %x, 32768
+  %b = and i16 %y, 32767
+  %c = add i16 %a, %b
+  ret i16 %c
+}
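
The nsw here is interval arithmetic on the masked values: %a = x | 0x8000 lies in [-32768, -1] and %b = y & 0x7FFF in [0, 32767], so

;   [-32768, -1] + [0, 32767] = [-32768, 32766]

which stays inside i16's signed range, so the add cannot signed-wrap.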
+
+define i16 @zero_sign_bit(i16 %a) {
+; CHECK-LABEL: @zero_sign_bit(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i16 [[A:%.*]], 32767
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i16 [[TMP1]], 512
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
+  %1 = and i16 %a, 32767
+  %2 = add i16 %1, 512
+  ret i16 %2
+}
+
+define i16 @zero_sign_bit2(i16 %a, i16 %b) {
+; CHECK-LABEL: @zero_sign_bit2(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i16 [[A:%.*]], 32767
+; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[B:%.*]], 32767
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i16 [[TMP3]]
+;
+  %1 = and i16 %a, 32767
+  %2 = and i16 %b, 32767
+  %3 = add i16 %1, %2
+  ret i16 %3
+}
+
+declare i16 @bounded(i16 %input)
+declare i32 @__gxx_personality_v0(...)
+!0 = !{i16 0, i16 32768} ; [0, 32767]
+!1 = !{i16 0, i16 32769} ; [0, 32768]
+
+define i16 @add_bounded_values(i16 %a, i16 %b) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK-LABEL: @add_bounded_values(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = call i16 @bounded(i16 [[A:%.*]]), !range !0
+; CHECK-NEXT:    [[D:%.*]] = invoke i16 @bounded(i16 [[B:%.*]])
+; CHECK-NEXT:    to label [[CONT:%.*]] unwind label [[LPAD:%.*]], !range !0
+; CHECK:       cont:
+; CHECK-NEXT:    [[E:%.*]] = add nuw i16 [[C]], [[D]]
+; CHECK-NEXT:    ret i16 [[E]]
+; CHECK:       lpad:
+; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    filter [0 x i8*] zeroinitializer
+; CHECK-NEXT:    ret i16 42
+;
+entry:
+  %c = call i16 @bounded(i16 %a), !range !0
+  %d = invoke i16 @bounded(i16 %b) to label %cont unwind label %lpad, !range !0
+cont:
+; %c and %d are in [0, 32767]. Therefore, %c + %d doesn't unsigned overflow.
+  %e = add i16 %c, %d
+  ret i16 %e
+lpad:
+  %0 = landingpad { i8*, i32 }
+  filter [0 x i8*] zeroinitializer
+  ret i16 42
+}
+
+define i16 @add_bounded_values_2(i16 %a, i16 %b) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK-LABEL: @add_bounded_values_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C:%.*]] = call i16 @bounded(i16 [[A:%.*]]), !range !1
+; CHECK-NEXT:    [[D:%.*]] = invoke i16 @bounded(i16 [[B:%.*]])
+; CHECK-NEXT:    to label [[CONT:%.*]] unwind label [[LPAD:%.*]], !range !1
+; CHECK:       cont:
+; CHECK-NEXT:    [[E:%.*]] = add i16 [[C]], [[D]]
+; CHECK-NEXT:    ret i16 [[E]]
+; CHECK:       lpad:
+; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    filter [0 x i8*] zeroinitializer
+; CHECK-NEXT:    ret i16 42
+;
+entry:
+  %c = call i16 @bounded(i16 %a), !range !1
+  %d = invoke i16 @bounded(i16 %b) to label %cont unwind label %lpad, !range !1
+cont:
+; Similar to add_bounded_values, but %c and %d are in [0, 32768]. Therefore,
+; %c + %d may unsigned overflow and we cannot add NUW.
+  %e = add i16 %c, %d
+  ret i16 %e
+lpad:
+  %0 = landingpad { i8*, i32 }
+  filter [0 x i8*] zeroinitializer
+  ret i16 42
+}
+
+; %a has at most one bit set
+; %b has a 0 bit other than the sign bit
+define i16 @ripple_nsw1(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_nsw1(
+; CHECK-NEXT:    [[A:%.*]] = and i16 [[Y:%.*]], 1
+; CHECK-NEXT:    [[B:%.*]] = and i16 [[X:%.*]], -16385
+; CHECK-NEXT:    [[C:%.*]] = add nuw nsw i16 [[A]], [[B]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = and i16 %y, 1
+  %b = and i16 %x, 49151
+  %c = add i16 %a, %b
+  ret i16 %c
+}
+
+; Like the previous test, but flip %a and %b
+define i16 @ripple_nsw2(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_nsw2(
+; CHECK-NEXT:    [[A:%.*]] = and i16 [[Y:%.*]], 1
+; CHECK-NEXT:    [[B:%.*]] = and i16 [[X:%.*]], -16385
+; CHECK-NEXT:    [[C:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = and i16 %y, 1
+  %b = and i16 %x, 49151
+  %c = add i16 %b, %a
+  ret i16 %c
+}
+
+define i16 @ripple_nsw3(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_nsw3(
+; CHECK-NEXT:    [[A:%.*]] = and i16 [[Y:%.*]], -21845
+; CHECK-NEXT:    [[B:%.*]] = and i16 [[X:%.*]], 21843
+; CHECK-NEXT:    [[C:%.*]] = add nuw nsw i16 [[A]], [[B]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = and i16 %y, 43691
+  %b = and i16 %x, 21843
+  %c = add i16 %a, %b
+  ret i16 %c
+}
+
+; Like the previous test, but flip %a and %b
+define i16 @ripple_nsw4(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_nsw4(
+; CHECK-NEXT:    [[A:%.*]] = and i16 [[Y:%.*]], -21845
+; CHECK-NEXT:    [[B:%.*]] = and i16 [[X:%.*]], 21843
+; CHECK-NEXT:    [[C:%.*]] = add nuw nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = and i16 %y, 43691
+  %b = and i16 %x, 21843
+  %c = add i16 %b, %a
+  ret i16 %c
+}
+
+define i16 @ripple_nsw5(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_nsw5(
+; CHECK-NEXT:    [[A:%.*]] = or i16 [[Y:%.*]], -21845
+; CHECK-NEXT:    [[B:%.*]] = or i16 [[X:%.*]], -10923
+; CHECK-NEXT:    [[C:%.*]] = add nsw i16 [[A]], [[B]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = or i16 %y, 43691
+  %b = or i16 %x, 54613
+  %c = add i16 %a, %b
+  ret i16 %c
+}
+
+; Like the previous test, but flip %a and %b
+define i16 @ripple_nsw6(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_nsw6(
+; CHECK-NEXT:    [[A:%.*]] = or i16 [[Y:%.*]], -21845
+; CHECK-NEXT:    [[B:%.*]] = or i16 [[X:%.*]], -10923
+; CHECK-NEXT:    [[C:%.*]] = add nsw i16 [[B]], [[A]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = or i16 %y, 43691
+  %b = or i16 %x, 54613
+  %c = add i16 %b, %a
+  ret i16 %c
+}
+
+; We know nothing about %x
+define i32 @ripple_no_nsw1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ripple_no_nsw1(
+; CHECK-NEXT:    [[A:%.*]] = and i32 [[Y:%.*]], 1
+; CHECK-NEXT:    [[B:%.*]] = add i32 [[A]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[B]]
+;
+  %a = and i32 %y, 1
+  %b = add i32 %a, %x
+  ret i32 %b
+}
+
+; %a has at most one bit set
+; %b has a 0 bit, but it is the sign bit
+define i16 @ripple_no_nsw2(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_no_nsw2(
+; CHECK-NEXT:    [[A:%.*]] = and i16 [[Y:%.*]], 1
+; CHECK-NEXT:    [[B:%.*]] = and i16 [[X:%.*]], 32767
+; CHECK-NEXT:    [[C:%.*]] = add nuw i16 [[A]], [[B]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = and i16 %y, 1
+  %b = and i16 %x, 32767
+  %c = add i16 %a, %b
+  ret i16 %c
+}
+
+define i16 @ripple_no_nsw3(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_no_nsw3(
+; CHECK-NEXT:    [[A:%.*]] = and i16 [[Y:%.*]], -21845
+; CHECK-NEXT:    [[B:%.*]] = and i16 [[X:%.*]], 21845
+; CHECK-NEXT:    [[C:%.*]] = add i16 [[A]], [[B]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = and i16 %y, 43691
+  %b = and i16 %x, 21845
+  %c = add i16 %a, %b
+  ret i16 %c
+}
+
+; Like the previous test, but flip %a and %b
+define i16 @ripple_no_nsw4(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_no_nsw4(
+; CHECK-NEXT:    [[A:%.*]] = and i16 [[Y:%.*]], -21845
+; CHECK-NEXT:    [[B:%.*]] = and i16 [[X:%.*]], 21845
+; CHECK-NEXT:    [[C:%.*]] = add i16 [[B]], [[A]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = and i16 %y, 43691
+  %b = and i16 %x, 21845
+  %c = add i16 %b, %a
+  ret i16 %c
+}
+
+define i16 @ripple_no_nsw5(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_no_nsw5(
+; CHECK-NEXT:    [[A:%.*]] = or i16 [[Y:%.*]], -21847
+; CHECK-NEXT:    [[B:%.*]] = or i16 [[X:%.*]], -10923
+; CHECK-NEXT:    [[C:%.*]] = add i16 [[A]], [[B]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = or i16 %y, 43689
+  %b = or i16 %x, 54613
+  %c = add i16 %a, %b
+  ret i16 %c
+}
+
+; Like the previous test, but flip %a and %b
+define i16 @ripple_no_nsw6(i16 %x, i16 %y) {
+; CHECK-LABEL: @ripple_no_nsw6(
+; CHECK-NEXT:    [[A:%.*]] = or i16 [[Y:%.*]], -21847
+; CHECK-NEXT:    [[B:%.*]] = or i16 [[X:%.*]], -10923
+; CHECK-NEXT:    [[C:%.*]] = add i16 [[B]], [[A]]
+; CHECK-NEXT:    ret i16 [[C]]
+;
+  %a = or i16 %y, 43689
+  %b = or i16 %x, 54613
+  %c = add i16 %b, %a
+  ret i16 %c
+}
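
The reasoning behind the nuw/nsw flags checked in this file, in one
standalone function (a sketch under our naming, not part of the patch):
instcombine computes the known bits of both operands, and when the sign
bits are known to be opposite, the add cannot wrap in the signed sense.

  define i16 @sketch_oppositesign(i16 %x, i16 %y) {
    %a = or i16 %x, -32768   ; sign bit known set:   %a is in [-32768, -1]
    %b = and i16 %y, 32767   ; sign bit known clear: %b is in [0, 32767]
    ; The sum stays within [-32768, 32766], so adding the nsw flag is sound.
    %c = add nsw i16 %a, %b
    ret i16 %c
  }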

Added: llvm/trunk/test/Transforms/InstCombine/CPP_min_max.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/CPP_min_max.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/CPP_min_max.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/CPP_min_max.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,34 @@
+; RUN: opt < %s -instcombine -S | \
+; RUN:   grep select | not grep 'i32\*'
+
+; This testcase corresponds to PR362, which notes that the C++ front-end and
+; LLVM optimizers generate this horrible code, full of unneeded loads and
+; other cruft.
+;
+; Instcombine should propagate the load through the select instructions to
+; allow elimination of the extra stuff by the mem2reg pass.
+
+define void @_Z5test1RiS_(i32* %x, i32* %y) {
+entry:
+        %tmp.1.i = load i32, i32* %y         ; <i32> [#uses=1]
+        %tmp.3.i = load i32, i32* %x         ; <i32> [#uses=1]
+        %tmp.4.i = icmp slt i32 %tmp.1.i, %tmp.3.i              ; <i1> [#uses=1]
+        %retval.i = select i1 %tmp.4.i, i32* %y, i32* %x                ; <i32*> [#uses=1]
+        %tmp.4 = load i32, i32* %retval.i            ; <i32> [#uses=1]
+        store i32 %tmp.4, i32* %x
+        ret void
+}
+
+define void @_Z5test2RiS_(i32* %x, i32* %y) {
+entry:
+        %tmp.0 = alloca i32             ; <i32*> [#uses=2]
+        %tmp.2 = load i32, i32* %x           ; <i32> [#uses=2]
+        store i32 %tmp.2, i32* %tmp.0
+        %tmp.3.i = load i32, i32* %y         ; <i32> [#uses=1]
+        %tmp.4.i = icmp slt i32 %tmp.2, %tmp.3.i                ; <i1> [#uses=1]
+        %retval.i = select i1 %tmp.4.i, i32* %y, i32* %tmp.0            ; <i32*> [#uses=1]
+        %tmp.6 = load i32, i32* %retval.i            ; <i32> [#uses=1]
+        store i32 %tmp.6, i32* %y
+        ret void
+}
+
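
The transform both functions above rely on, in isolation (a sketch with our
names, not part of the patch): a load through a select of two pointers
becomes a select of the two loaded values, provided instcombine can prove
both loads are safe to speculate.

  define i32 @sketch_load_of_select(i1 %c, i32* align 4 dereferenceable(4) %x,
                                    i32* align 4 dereferenceable(4) %y) {
    %p = select i1 %c, i32* %x, i32* %y
    ; Expected after -instcombine (both loads are speculatable here):
    ;   %x.val = load i32, i32* %x
    ;   %y.val = load i32, i32* %y
    ;   %v = select i1 %c, i32 %x.val, i32 %y.val
    %v = load i32, i32* %p
    ret i32 %v
  }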

Added: llvm/trunk/test/Transforms/InstCombine/ExtractCast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/ExtractCast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/ExtractCast.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/ExtractCast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,27 @@
+; RUN: opt < %s -instcombine -S -o - | FileCheck %s
+
+; CHECK-LABEL: @a(
+define i32 @a(<4 x i64> %I) {
+entry:
+; CHECK-NOT: trunc <4 x i64>
+        %J = trunc <4 x i64> %I to <4 x i32>
+        %K = extractelement <4 x i32> %J, i32 3
+; CHECK: extractelement <4 x i64>
+; CHECK: trunc i64
+; CHECK: ret
+        ret i32 %K
+}
+
+
+; CHECK-LABEL: @b(
+define i32 @b(<4 x float> %I) {
+entry:
+; CHECK-NOT: fptosi <4 x float>
+        %J = fptosi <4 x float> %I to <4 x i32>
+        %K = extractelement <4 x i32> %J, i32 3
+; CHECK: extractelement <4 x float>
+; CHECK: fptosi float
+; CHECK: ret
+        ret i32 %K
+}
+
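
Spelled out, the fold both functions check (a sketch; the function name is
ours): scalarize first, cast second, because extracting one lane of a cast
vector equals casting one extracted lane.

  define i32 @sketch_extract_then_trunc(<4 x i64> %v) {
    ; Expected post-instcombine form of @a above: the vector trunc is gone
    ; and only lane 3 is truncated.
    %lane = extractelement <4 x i64> %v, i32 3
    %r = trunc i64 %lane to i32
    ret i32 %r
  }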

Added: llvm/trunk/test/Transforms/InstCombine/IntPtrCast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/IntPtrCast.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/IntPtrCast.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/IntPtrCast.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,10 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-p:32:32"
+
+define i32* @test(i32* %P) {
+        %V = ptrtoint i32* %P to i32            ; <i32> [#uses=1]
+        %P2 = inttoptr i32 %V to i32*           ; <i32*> [#uses=1]
+        ret i32* %P2
+; CHECK: ret i32* %P
+}
+
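
The datalayout line is what licenses this fold: with p:32:32, an i32 round
trip preserves every pointer bit. A counterexample of our own (not in the
patch) where the same fold must not fire, because the intermediate integer
is too narrow:

  define i32* @sketch_narrow_roundtrip(i32* %p) {
    ; ptrtoint to i16 drops the high 16 bits of a 32-bit pointer, so this
    ; must NOT simplify to "ret i32* %p".
    %v = ptrtoint i32* %p to i16
    %q = inttoptr i16 %v to i32*
    ret i32* %q
  }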

Added: llvm/trunk/test/Transforms/InstCombine/JavaCompare.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/JavaCompare.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/JavaCompare.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/JavaCompare.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,14 @@
+; This is the sequence of code that the Java front-end expands for a single
+; <= comparison.  Check to make sure we turn it into a single <= (and nothing else).
+
+; RUN: opt < %s -instcombine -S | grep "icmp sle i32 %A, %B"
+
+define i1 @le(i32 %A, i32 %B) {
+        %c1 = icmp sgt i32 %A, %B               ; <i1> [#uses=1]
+        %tmp = select i1 %c1, i32 1, i32 0              ; <i32> [#uses=1]
+        %c2 = icmp slt i32 %A, %B               ; <i1> [#uses=1]
+        %result = select i1 %c2, i32 -1, i32 %tmp               ; <i32> [#uses=1]
+        %c3 = icmp sle i32 %result, 0           ; <i1> [#uses=1]
+        ret i1 %c3
+}
+
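
Why the fold is sound (our summary): the two selects compute the three-way
result sign(A-B) as -1, 0, or 1, and "result <= 0" holds exactly when
A <= B. The expected output named in the RUN line, written out as a
function (sketch):

  define i1 @sketch_le_folded(i32 %A, i32 %B) {
    %c = icmp sle i32 %A, %B
    ret i1 %c
  }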

Added: llvm/trunk/test/Transforms/InstCombine/LandingPadClauses.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/LandingPadClauses.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/LandingPadClauses.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/LandingPadClauses.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,288 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+ at T1 = external constant i32
+ at T2 = external constant i32
+ at T3 = external constant i32
+
+declare i32 @generic_personality(i32, i64, i8*, i8*)
+declare i32 @__gxx_personality_v0(i32, i64, i8*, i8*)
+declare i32 @__objc_personality_v0(i32, i64, i8*, i8*)
+declare i32 @__C_specific_handler(...)
+
+declare void @bar()
+
+define void @foo_generic() personality i32 (i32, i64, i8*, i8*)* @generic_personality {
+; CHECK-LABEL: @foo_generic(
+  invoke void @bar()
+    to label %cont.a unwind label %lpad.a
+cont.a:
+  invoke void @bar()
+    to label %cont.b unwind label %lpad.b
+cont.b:
+  invoke void @bar()
+    to label %cont.c unwind label %lpad.c
+cont.c:
+  invoke void @bar()
+    to label %cont.d unwind label %lpad.d
+cont.d:
+  invoke void @bar()
+    to label %cont.e unwind label %lpad.e
+cont.e:
+  invoke void @bar()
+    to label %cont.f unwind label %lpad.f
+cont.f:
+  invoke void @bar()
+    to label %cont.g unwind label %lpad.g
+cont.g:
+  invoke void @bar()
+    to label %cont.h unwind label %lpad.h
+cont.h:
+  invoke void @bar()
+    to label %cont.i unwind label %lpad.i
+cont.i:
+  ret void
+
+lpad.a:
+  %a = landingpad { i8*, i32 }
+          catch i32* @T1
+          catch i32* @T2
+          catch i32* @T1
+          catch i32* @T2
+  unreachable
+; CHECK: %a = landingpad
+; CHECK-NEXT: @T1
+; CHECK-NEXT: @T2
+; CHECK-NEXT: unreachable
+
+lpad.b:
+  %b = landingpad { i8*, i32 }
+          filter [0 x i32*] zeroinitializer
+          catch i32* @T1
+  unreachable
+; CHECK: %b = landingpad
+; CHECK-NEXT: filter
+; CHECK-NEXT: unreachable
+
+lpad.c:
+  %c = landingpad { i8*, i32 }
+          catch i32* @T1
+          filter [1 x i32*] [i32* @T1]
+          catch i32* @T2
+  unreachable
+; Caught types should not be removed from filters
+; CHECK: %c = landingpad
+; CHECK-NEXT: catch i32* @T1
+; CHECK-NEXT: filter [1 x i32*] [i32* @T1]
+; CHECK-NEXT: catch i32* @T2
+; CHECK-NEXT: unreachable
+
+lpad.d:
+  %d = landingpad { i8*, i32 }
+          filter [3 x i32*] zeroinitializer
+  unreachable
+; CHECK: %d = landingpad
+; CHECK-NEXT: filter [1 x i32*] zeroinitializer
+; CHECK-NEXT: unreachable
+
+lpad.e:
+  %e = landingpad { i8*, i32 }
+          catch i32* @T1
+          filter [3 x i32*] [i32* @T1, i32* @T2, i32* @T2]
+  unreachable
+; Caught types should not be removed from filters
+; CHECK: %e = landingpad
+; CHECK-NEXT: catch i32* @T1
+; CHECK-NEXT: filter [2 x i32*] [i32* @T1, i32* @T2]
+; CHECK-NEXT: unreachable
+
+lpad.f:
+  %f = landingpad { i8*, i32 }
+          filter [2 x i32*] [i32* @T2, i32* @T1]
+          filter [1 x i32*] [i32* @T1]
+  unreachable
+; CHECK: %f = landingpad
+; CHECK-NEXT: filter [1 x i32*] [i32* @T1]
+; CHECK-NEXT: unreachable
+
+lpad.g:
+  %g = landingpad { i8*, i32 }
+          filter [1 x i32*] [i32* @T1]
+          catch i32* @T3
+          filter [2 x i32*] [i32* @T2, i32* @T1]
+  unreachable
+; CHECK: %g = landingpad
+; CHECK-NEXT: filter [1 x i32*] [i32* @T1]
+; CHECK-NEXT: catch i32* @T3
+; CHECK-NEXT: unreachable
+
+lpad.h:
+  %h = landingpad { i8*, i32 }
+          filter [2 x i32*] [i32* @T1, i32* null]
+          filter [1 x i32*] zeroinitializer
+  unreachable
+; CHECK: %h = landingpad
+; CHECK-NEXT: filter [1 x i32*] zeroinitializer
+; CHECK-NEXT: unreachable
+
+lpad.i:
+  %i = landingpad { i8*, i32 }
+          cleanup
+          filter [0 x i32*] zeroinitializer
+  unreachable
+; CHECK: %i = landingpad
+; CHECK-NEXT: filter
+; CHECK-NEXT: unreachable
+}
+
+define void @foo_cxx() personality i32 (i32, i64, i8*, i8*)* @__gxx_personality_v0 {
+; CHECK-LABEL: @foo_cxx(
+  invoke void @bar()
+    to label %cont.a unwind label %lpad.a
+cont.a:
+  invoke void @bar()
+    to label %cont.b unwind label %lpad.b
+cont.b:
+  invoke void @bar()
+    to label %cont.c unwind label %lpad.c
+cont.c:
+  invoke void @bar()
+    to label %cont.d unwind label %lpad.d
+cont.d:
+  ret void
+
+lpad.a:
+  %a = landingpad { i8*, i32 }
+          catch i32* null
+          catch i32* @T1
+  unreachable
+; CHECK: %a = landingpad
+; CHECK-NEXT: null
+; CHECK-NEXT: unreachable
+
+lpad.b:
+  %b = landingpad { i8*, i32 }
+          filter [1 x i32*] zeroinitializer
+  unreachable
+; CHECK: %b = landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: unreachable
+
+lpad.c:
+  %c = landingpad { i8*, i32 }
+          filter [2 x i32*] [i32* @T1, i32* null]
+  unreachable
+; CHECK: %c = landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: unreachable
+
+lpad.d:
+  %d = landingpad { i8*, i32 }
+          cleanup
+          catch i32* null
+  unreachable
+; CHECK: %d = landingpad
+; CHECK-NEXT: null
+; CHECK-NEXT: unreachable
+}
+
+define void @foo_objc() personality i32 (i32, i64, i8*, i8*)* @__objc_personality_v0 {
+; CHECK-LABEL: @foo_objc(
+  invoke void @bar()
+    to label %cont.a unwind label %lpad.a
+cont.a:
+  invoke void @bar()
+    to label %cont.b unwind label %lpad.b
+cont.b:
+  invoke void @bar()
+    to label %cont.c unwind label %lpad.c
+cont.c:
+  invoke void @bar()
+    to label %cont.d unwind label %lpad.d
+cont.d:
+  ret void
+
+lpad.a:
+  %a = landingpad { i8*, i32 }
+          catch i32* null
+          catch i32* @T1
+  unreachable
+; CHECK: %a = landingpad
+; CHECK-NEXT: null
+; CHECK-NEXT: unreachable
+
+lpad.b:
+  %b = landingpad { i8*, i32 }
+          filter [1 x i32*] zeroinitializer
+  unreachable
+; CHECK: %b = landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: unreachable
+
+lpad.c:
+  %c = landingpad { i8*, i32 }
+          filter [2 x i32*] [i32* @T1, i32* null]
+  unreachable
+; CHECK: %c = landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: unreachable
+
+lpad.d:
+  %d = landingpad { i8*, i32 }
+          cleanup
+          catch i32* null
+  unreachable
+; CHECK: %d = landingpad
+; CHECK-NEXT: null
+; CHECK-NEXT: unreachable
+}
+
+define void @foo_seh() personality i32 (...)* @__C_specific_handler {
+; CHECK-LABEL: @foo_seh(
+  invoke void @bar()
+    to label %cont.a unwind label %lpad.a
+cont.a:
+  invoke void @bar()
+    to label %cont.b unwind label %lpad.b
+cont.b:
+  invoke void @bar()
+    to label %cont.c unwind label %lpad.c
+cont.c:
+  invoke void @bar()
+    to label %cont.d unwind label %lpad.d
+cont.d:
+  ret void
+
+lpad.a:
+  %a = landingpad { i8*, i32 }
+          catch i32* null
+          catch i32* @T1
+  unreachable
+; CHECK: %a = landingpad
+; CHECK-NEXT: null
+; CHECK-NEXT: unreachable
+
+lpad.b:
+  %b = landingpad { i8*, i32 }
+          filter [1 x i32*] zeroinitializer
+  unreachable
+; CHECK: %b = landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: unreachable
+
+lpad.c:
+  %c = landingpad { i8*, i32 }
+          filter [2 x i32*] [i32* @T1, i32* null]
+  unreachable
+; CHECK: %c = landingpad
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: unreachable
+
+lpad.d:
+  %d = landingpad { i8*, i32 }
+          cleanup
+          catch i32* null
+  unreachable
+; CHECK: %d = landingpad
+; CHECK-NEXT: null
+; CHECK-NEXT: unreachable
+}
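
A minimal standalone instance of the first rule exercised above (a sketch
reusing @bar, @T1, and @generic_personality from the file; the function
name is ours): clauses that can never be selected are dropped.

  define void @sketch_dup_catch() personality i32 (i32, i64, i8*, i8*)* @generic_personality {
    invoke void @bar()
      to label %cont unwind label %lpad
  cont:
    ret void
  lpad:
    %lp = landingpad { i8*, i32 }
            catch i32* @T1
            catch i32* @T1    ; duplicate of the clause above; expected to be removed
    unreachable
  }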

Added: llvm/trunk/test/Transforms/InstCombine/NVPTX/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/NVPTX/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/NVPTX/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/InstCombine/NVPTX/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'NVPTX' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,471 @@
+; Check that nvvm intrinsics get simplified to target-generic intrinsics where
+; possible.
+;
+; We run this test twice; once with ftz on, and again with ftz off.  Behold the
+; hackery:
+
+; RUN: cat %s > %t.ftz
+; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "true" }' >> %t.ftz
+; RUN: opt < %t.ftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ
+
+; RUN: cat %s > %t.noftz
+; RUN: echo 'attributes #0 = { "nvptx-f32ftz" = "false" }' >> %t.noftz
+; RUN: opt < %t.noftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ
+
+; We handle nvvm intrinsics with ftz variants as follows:
+;  - If the module is in ftz mode, the ftz variant is transformed into the
+;    regular llvm intrinsic, and the non-ftz variant is left alone.
+;  - If the module is not in ftz mode, it's the reverse: Only the non-ftz
+;    variant is transformed, and the ftz variant is left alone.
+
+; Check NVVM intrinsics that map directly to LLVM target-generic intrinsics.
+
+; CHECK-LABEL: @ceil_double
+define double @ceil_double(double %a) #0 {
+; CHECK: call double @llvm.ceil.f64
+  %ret = call double @llvm.nvvm.ceil.d(double %a)
+  ret double %ret
+}
+; CHECK-LABEL: @ceil_float
+define float @ceil_float(float %a) #0 {
+; NOFTZ: call float @llvm.ceil.f32
+; FTZ: call float @llvm.nvvm.ceil.f
+  %ret = call float @llvm.nvvm.ceil.f(float %a)
+  ret float %ret
+}
+; CHECK-LABEL: @ceil_float_ftz
+define float @ceil_float_ftz(float %a) #0 {
+; NOFTZ: call float @llvm.nvvm.ceil.ftz.f
+; FTZ: call float @llvm.ceil.f32
+  %ret = call float @llvm.nvvm.ceil.ftz.f(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: @fabs_double
+define double @fabs_double(double %a) #0 {
+; CHECK: call double @llvm.fabs.f64
+  %ret = call double @llvm.nvvm.fabs.d(double %a)
+  ret double %ret
+}
+; CHECK-LABEL: @fabs_float
+define float @fabs_float(float %a) #0 {
+; NOFTZ: call float @llvm.fabs.f32
+; FTZ: call float @llvm.nvvm.fabs.f
+  %ret = call float @llvm.nvvm.fabs.f(float %a)
+  ret float %ret
+}
+; CHECK-LABEL: @fabs_float_ftz
+define float @fabs_float_ftz(float %a) #0 {
+; NOFTZ: call float @llvm.nvvm.fabs.ftz.f
+; FTZ: call float @llvm.fabs.f32
+  %ret = call float @llvm.nvvm.fabs.ftz.f(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: @floor_double
+define double @floor_double(double %a) #0 {
+; CHECK: call double @llvm.floor.f64
+  %ret = call double @llvm.nvvm.floor.d(double %a)
+  ret double %ret
+}
+; CHECK-LABEL: @floor_float
+define float @floor_float(float %a) #0 {
+; NOFTZ: call float @llvm.floor.f32
+; FTZ: call float @llvm.nvvm.floor.f
+  %ret = call float @llvm.nvvm.floor.f(float %a)
+  ret float %ret
+}
+; CHECK-LABEL: @floor_float_ftz
+define float @floor_float_ftz(float %a) #0 {
+; NOFTZ: call float @llvm.nvvm.floor.ftz.f
+; FTZ: call float @llvm.floor.f32
+  %ret = call float @llvm.nvvm.floor.ftz.f(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: @fma_double
+define double @fma_double(double %a, double %b, double %c) #0 {
+; CHECK: call double @llvm.fma.f64
+  %ret = call double @llvm.nvvm.fma.rn.d(double %a, double %b, double %c)
+  ret double %ret
+}
+; CHECK-LABEL: @fma_float
+define float @fma_float(float %a, float %b, float %c) #0 {
+; NOFTZ: call float @llvm.fma.f32
+; FTZ: call float @llvm.nvvm.fma.rn.f
+  %ret = call float @llvm.nvvm.fma.rn.f(float %a, float %b, float %c)
+  ret float %ret
+}
+; CHECK-LABEL: @fma_float_ftz
+define float @fma_float_ftz(float %a, float %b, float %c) #0 {
+; NOFTZ: call float @llvm.nvvm.fma.rn.ftz.f
+; FTZ: call float @llvm.fma.f32
+  %ret = call float @llvm.nvvm.fma.rn.ftz.f(float %a, float %b, float %c)
+  ret float %ret
+}
+
+; CHECK-LABEL: @fmax_double
+define double @fmax_double(double %a, double %b) #0 {
+; CHECK: call double @llvm.maxnum.f64
+  %ret = call double @llvm.nvvm.fmax.d(double %a, double %b)
+  ret double %ret
+}
+; CHECK-LABEL: @fmax_float
+define float @fmax_float(float %a, float %b) #0 {
+; NOFTZ: call float @llvm.maxnum.f32
+; FTZ: call float @llvm.nvvm.fmax.f
+  %ret = call float @llvm.nvvm.fmax.f(float %a, float %b)
+  ret float %ret
+}
+; CHECK-LABEL: @fmax_float_ftz
+define float @fmax_float_ftz(float %a, float %b) #0 {
+; NOFTZ: call float @llvm.nvvm.fmax.ftz.f
+; FTZ: call float @llvm.maxnum.f32
+  %ret = call float @llvm.nvvm.fmax.ftz.f(float %a, float %b)
+  ret float %ret
+}
+
+; CHECK-LABEL: @fmin_double
+define double @fmin_double(double %a, double %b) #0 {
+; CHECK: call double @llvm.minnum.f64
+  %ret = call double @llvm.nvvm.fmin.d(double %a, double %b)
+  ret double %ret
+}
+; CHECK-LABEL: @fmin_float
+define float @fmin_float(float %a, float %b) #0 {
+; NOFTZ: call float @llvm.minnum.f32
+; FTZ: call float @llvm.nvvm.fmin.f
+  %ret = call float @llvm.nvvm.fmin.f(float %a, float %b)
+  ret float %ret
+}
+; CHECK-LABEL: @fmin_float_ftz
+define float @fmin_float_ftz(float %a, float %b) #0 {
+; NOFTZ: call float @llvm.nvvm.fmin.ftz.f
+; FTZ: call float @llvm.minnum.f32
+  %ret = call float @llvm.nvvm.fmin.ftz.f(float %a, float %b)
+  ret float %ret
+}
+
+; CHECK-LABEL: @round_double
+define double @round_double(double %a) #0 {
+; CHECK: call double @llvm.round.f64
+  %ret = call double @llvm.nvvm.round.d(double %a)
+  ret double %ret
+}
+; CHECK-LABEL: @round_float
+define float @round_float(float %a) #0 {
+; NOFTZ: call float @llvm.round.f32
+; FTZ: call float @llvm.nvvm.round.f
+  %ret = call float @llvm.nvvm.round.f(float %a)
+  ret float %ret
+}
+; CHECK-LABEL: @round_float_ftz
+define float @round_float_ftz(float %a) #0 {
+; NOFTZ: call float @llvm.nvvm.round.ftz.f
+; FTZ: call float @llvm.round.f32
+  %ret = call float @llvm.nvvm.round.ftz.f(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: @trunc_double
+define double @trunc_double(double %a) #0 {
+; CHECK: call double @llvm.trunc.f64
+  %ret = call double @llvm.nvvm.trunc.d(double %a)
+  ret double %ret
+}
+; CHECK-LABEL: @trunc_float
+define float @trunc_float(float %a) #0 {
+; NOFTZ: call float @llvm.trunc.f32
+; FTZ: call float @llvm.nvvm.trunc.f
+  %ret = call float @llvm.nvvm.trunc.f(float %a)
+  ret float %ret
+}
+; CHECK-LABEL: @trunc_float_ftz
+define float @trunc_float_ftz(float %a) #0 {
+; NOFTZ: call float @llvm.nvvm.trunc.ftz.f
+; FTZ: call float @llvm.trunc.f32
+  %ret = call float @llvm.nvvm.trunc.ftz.f(float %a)
+  ret float %ret
+}
+
+; Check NVVM intrinsics that correspond to LLVM cast operations.
+
+; CHECK-LABEL: @test_d2i
+define i32 @test_d2i(double %a) #0 {
+; CHECK: fptosi double %a to i32
+  %ret = call i32 @llvm.nvvm.d2i.rz(double %a)
+  ret i32 %ret
+}
+; CHECK-LABEL: @test_f2i
+define i32 @test_f2i(float %a) #0 {
+; CHECK: fptosi float %a to i32
+  %ret = call i32 @llvm.nvvm.f2i.rz(float %a)
+  ret i32 %ret
+}
+; CHECK-LABEL: @test_d2ll
+define i64 @test_d2ll(double %a) #0 {
+; CHECK: fptosi double %a to i64
+  %ret = call i64 @llvm.nvvm.d2ll.rz(double %a)
+  ret i64 %ret
+}
+; CHECK-LABEL: @test_f2ll
+define i64 @test_f2ll(float %a) #0 {
+; CHECK: fptosi float %a to i64
+  %ret = call i64 @llvm.nvvm.f2ll.rz(float %a)
+  ret i64 %ret
+}
+; CHECK-LABEL: @test_d2ui
+define i32 @test_d2ui(double %a) #0 {
+; CHECK: fptoui double %a to i32
+  %ret = call i32 @llvm.nvvm.d2ui.rz(double %a)
+  ret i32 %ret
+}
+; CHECK-LABEL: @test_f2ui
+define i32 @test_f2ui(float %a) #0 {
+; CHECK: fptoui float %a to i32
+  %ret = call i32 @llvm.nvvm.f2ui.rz(float %a)
+  ret i32 %ret
+}
+; CHECK-LABEL: @test_d2ull
+define i64 @test_d2ull(double %a) #0 {
+; CHECK: fptoui double %a to i64
+  %ret = call i64 @llvm.nvvm.d2ull.rz(double %a)
+  ret i64 %ret
+}
+; CHECK-LABEL: @test_f2ull
+define i64 @test_f2ull(float %a) #0 {
+; CHECK: fptoui float %a to i64
+  %ret = call i64 @llvm.nvvm.f2ull.rz(float %a)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: @test_i2d
+define double @test_i2d(i32 %a) #0 {
+; CHECK: sitofp i32 %a to double
+  %ret = call double @llvm.nvvm.i2d.rz(i32 %a)
+  ret double %ret
+}
+; CHECK-LABEL: @test_i2f
+define float @test_i2f(i32 %a) #0 {
+; CHECK: sitofp i32 %a to float
+  %ret = call float @llvm.nvvm.i2f.rz(i32 %a)
+  ret float %ret
+}
+; CHECK-LABEL: @test_ll2d
+define double @test_ll2d(i64 %a) #0 {
+; CHECK: sitofp i64 %a to double
+  %ret = call double @llvm.nvvm.ll2d.rz(i64 %a)
+  ret double %ret
+}
+; CHECK-LABEL: @test_ll2f
+define float @test_ll2f(i64 %a) #0 {
+; CHECK: sitofp i64 %a to float
+  %ret = call float @llvm.nvvm.ll2f.rz(i64 %a)
+  ret float %ret
+}
+; CHECK-LABEL: @test_ui2d
+define double @test_ui2d(i32 %a) #0 {
+; CHECK: uitofp i32 %a to double
+  %ret = call double @llvm.nvvm.ui2d.rz(i32 %a)
+  ret double %ret
+}
+; CHECK-LABEL: @test_ui2f
+define float @test_ui2f(i32 %a) #0 {
+; CHECK: uitofp i32 %a to float
+  %ret = call float @llvm.nvvm.ui2f.rz(i32 %a)
+  ret float %ret
+}
+; CHECK-LABEL: @test_ull2d
+define double @test_ull2d(i64 %a) #0 {
+; CHECK: uitofp i64 %a to double
+  %ret = call double @llvm.nvvm.ull2d.rz(i64 %a)
+  ret double %ret
+}
+; CHECK-LABEL: @test_ull2f
+define float @test_ull2f(i64 %a) #0 {
+; CHECK: uitofp i64 %a to float
+  %ret = call float @llvm.nvvm.ull2f.rz(i64 %a)
+  ret float %ret
+}
+
+; Check NVVM intrinsics that map to LLVM binary operations.
+
+; CHECK-LABEL: @test_add_rn_d
+define double @test_add_rn_d(double %a, double %b) #0 {
+; CHECK: fadd
+  %ret = call double @llvm.nvvm.add.rn.d(double %a, double %b)
+  ret double %ret
+}
+; CHECK-LABEL: @test_add_rn_f
+define float @test_add_rn_f(float %a, float %b) #0 {
+; NOFTZ: fadd
+; FTZ: call float @llvm.nvvm.add.rn.f
+  %ret = call float @llvm.nvvm.add.rn.f(float %a, float %b)
+  ret float %ret
+}
+; CHECK-LABEL: @test_add_rn_f_ftz
+define float @test_add_rn_f_ftz(float %a, float %b) #0 {
+; NOFTZ: call float @llvm.nvvm.add.rn.f
+; FTZ: fadd
+  %ret = call float @llvm.nvvm.add.rn.ftz.f(float %a, float %b)
+  ret float %ret
+}
+
+; CHECK-LABEL: @test_mul_rn_d
+define double @test_mul_rn_d(double %a, double %b) #0 {
+; CHECK: fmul
+  %ret = call double @llvm.nvvm.mul.rn.d(double %a, double %b)
+  ret double %ret
+}
+; CHECK-LABEL: @test_mul_rn_f
+define float @test_mul_rn_f(float %a, float %b) #0 {
+; NOFTZ: fmul
+; FTZ: call float @llvm.nvvm.mul.rn.f
+  %ret = call float @llvm.nvvm.mul.rn.f(float %a, float %b)
+  ret float %ret
+}
+; CHECK-LABEL: @test_mul_rn_f_ftz
+define float @test_mul_rn_f_ftz(float %a, float %b) #0 {
+; NOFTZ: call float @llvm.nvvm.mul.rn.f
+; FTZ: fmul
+  %ret = call float @llvm.nvvm.mul.rn.ftz.f(float %a, float %b)
+  ret float %ret
+}
+
+; CHECK-LABEL: @test_div_rn_d
+define double @test_div_rn_d(double %a, double %b) #0 {
+; CHECK: fdiv
+  %ret = call double @llvm.nvvm.div.rn.d(double %a, double %b)
+  ret double %ret
+}
+; CHECK-LABEL: @test_div_rn_f
+define float @test_div_rn_f(float %a, float %b) #0 {
+; NOFTZ: fdiv
+; FTZ: call float @llvm.nvvm.div.rn.f
+  %ret = call float @llvm.nvvm.div.rn.f(float %a, float %b)
+  ret float %ret
+}
+; CHECK-LABEL: @test_div_rn_f_ftz
+define float @test_div_rn_f_ftz(float %a, float %b) #0 {
+; NOFTZ: call float @llvm.nvvm.div.rn.f
+; FTZ: fdiv
+  %ret = call float @llvm.nvvm.div.rn.ftz.f(float %a, float %b)
+  ret float %ret
+}
+
+; Check NVVM intrinsics that require us to emit custom IR.
+
+; CHECK-LABEL: @test_rcp_rn_f
+define float @test_rcp_rn_f(float %a) #0 {
+; NOFTZ: fdiv float 1.0{{.*}} %a
+; FTZ: call float @llvm.nvvm.rcp.rn.f
+  %ret = call float @llvm.nvvm.rcp.rn.f(float %a)
+  ret float %ret
+}
+; CHECK-LABEL: @test_rcp_rn_f_ftz
+define float @test_rcp_rn_f_ftz(float %a) #0 {
+; NOFTZ: call float @llvm.nvvm.rcp.rn.f
+; FTZ: fdiv float 1.0{{.*}} %a
+  %ret = call float @llvm.nvvm.rcp.rn.ftz.f(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: @test_sqrt_rn_d
+define double @test_sqrt_rn_d(double %a) #0 {
+; CHECK: call double @llvm.sqrt.f64(double %a)
+  %ret = call double @llvm.nvvm.sqrt.rn.d(double %a)
+  ret double %ret
+}
+; nvvm.sqrt.f is a special case: it becomes llvm.sqrt.f32 in both ftz and non-ftz modes.
+; CHECK-LABEL: @test_sqrt_f
+define float @test_sqrt_f(float %a) #0 {
+; CHECK: call float @llvm.sqrt.f32(float %a)
+  %ret = call float @llvm.nvvm.sqrt.f(float %a)
+  ret float %ret
+}
+; CHECK-LABEL: @test_sqrt_rn_f
+define float @test_sqrt_rn_f(float %a) #0 {
+; NOFTZ: call float @llvm.sqrt.f32(float %a)
+; FTZ: call float @llvm.nvvm.sqrt.rn.f
+  %ret = call float @llvm.nvvm.sqrt.rn.f(float %a)
+  ret float %ret
+}
+; CHECK-LABEL: @test_sqrt_rn_f_ftz
+define float @test_sqrt_rn_f_ftz(float %a) #0 {
+; NOFTZ: call float @llvm.nvvm.sqrt.rn.f
+; FTZ: call float @llvm.sqrt.f32(float %a)
+  %ret = call float @llvm.nvvm.sqrt.rn.ftz.f(float %a)
+  ret float %ret
+}
+
+declare double @llvm.nvvm.add.rn.d(double, double)
+declare float @llvm.nvvm.add.rn.f(float, float)
+declare float @llvm.nvvm.add.rn.ftz.f(float, float)
+declare double @llvm.nvvm.ceil.d(double)
+declare float @llvm.nvvm.ceil.f(float)
+declare float @llvm.nvvm.ceil.ftz.f(float)
+declare float @llvm.nvvm.d2f.rm(double)
+declare float @llvm.nvvm.d2f.rm.ftz(double)
+declare float @llvm.nvvm.d2f.rp(double)
+declare float @llvm.nvvm.d2f.rp.ftz(double)
+declare float @llvm.nvvm.d2f.rz(double)
+declare float @llvm.nvvm.d2f.rz.ftz(double)
+declare i32 @llvm.nvvm.d2i.rz(double)
+declare i64 @llvm.nvvm.d2ll.rz(double)
+declare i32 @llvm.nvvm.d2ui.rz(double)
+declare i64 @llvm.nvvm.d2ull.rz(double)
+declare double @llvm.nvvm.div.rn.d(double, double)
+declare float @llvm.nvvm.div.rn.f(float, float)
+declare float @llvm.nvvm.div.rn.ftz.f(float, float)
+declare i16 @llvm.nvvm.f2h.rz(float)
+declare i16 @llvm.nvvm.f2h.rz.ftz(float)
+declare i32 @llvm.nvvm.f2i.rz(float)
+declare i32 @llvm.nvvm.f2i.rz.ftz(float)
+declare i64 @llvm.nvvm.f2ll.rz(float)
+declare i64 @llvm.nvvm.f2ll.rz.ftz(float)
+declare i32 @llvm.nvvm.f2ui.rz(float)
+declare i32 @llvm.nvvm.f2ui.rz.ftz(float)
+declare i64 @llvm.nvvm.f2ull.rz(float)
+declare i64 @llvm.nvvm.f2ull.rz.ftz(float)
+declare double @llvm.nvvm.fabs.d(double)
+declare float @llvm.nvvm.fabs.f(float)
+declare float @llvm.nvvm.fabs.ftz.f(float)
+declare double @llvm.nvvm.floor.d(double)
+declare float @llvm.nvvm.floor.f(float)
+declare float @llvm.nvvm.floor.ftz.f(float)
+declare double @llvm.nvvm.fma.rn.d(double, double, double)
+declare float @llvm.nvvm.fma.rn.f(float, float, float)
+declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float)
+declare double @llvm.nvvm.fmax.d(double, double)
+declare float @llvm.nvvm.fmax.f(float, float)
+declare float @llvm.nvvm.fmax.ftz.f(float, float)
+declare double @llvm.nvvm.fmin.d(double, double)
+declare float @llvm.nvvm.fmin.f(float, float)
+declare float @llvm.nvvm.fmin.ftz.f(float, float)
+declare double @llvm.nvvm.i2d.rz(i32)
+declare float @llvm.nvvm.i2f.rz(i32)
+declare double @llvm.nvvm.ll2d.rz(i64)
+declare float @llvm.nvvm.ll2f.rz(i64)
+declare double @llvm.nvvm.lohi.i2d(i32, i32)
+declare double @llvm.nvvm.mul.rn.d(double, double)
+declare float @llvm.nvvm.mul.rn.f(float, float)
+declare float @llvm.nvvm.mul.rn.ftz.f(float, float)
+declare double @llvm.nvvm.rcp.rm.d(double)
+declare double @llvm.nvvm.rcp.rn.d(double)
+declare float @llvm.nvvm.rcp.rn.f(float)
+declare float @llvm.nvvm.rcp.rn.ftz.f(float)
+declare double @llvm.nvvm.round.d(double)
+declare float @llvm.nvvm.round.f(float)
+declare float @llvm.nvvm.round.ftz.f(float)
+declare float @llvm.nvvm.sqrt.f(float)
+declare double @llvm.nvvm.sqrt.rn.d(double)
+declare float @llvm.nvvm.sqrt.rn.f(float)
+declare float @llvm.nvvm.sqrt.rn.ftz.f(float)
+declare double @llvm.nvvm.trunc.d(double)
+declare float @llvm.nvvm.trunc.f(float)
+declare float @llvm.nvvm.trunc.ftz.f(float)
+declare double @llvm.nvvm.ui2d.rz(i32)
+declare float @llvm.nvvm.ui2f.rn(i32)
+declare float @llvm.nvvm.ui2f.rz(i32)
+declare double @llvm.nvvm.ull2d.rz(i64)
+declare float @llvm.nvvm.ull2f.rz(i64)
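
The ftz rule stated at the top of the file, reduced to a single pair (a
sketch with our names, reusing the declarations above): in a non-ftz module
only the plain variant is rewritten; in an ftz module only the .ftz variant
is.

  define float @sketch_ceil_pair(float %a) #0 {
    ; With "nvptx-f32ftz" = "false": %p becomes call float @llvm.ceil.f32(float %a)
    ; and %q is left alone. With "nvptx-f32ftz" = "true": the reverse.
    %p = call float @llvm.nvvm.ceil.f(float %a)
    %q = call float @llvm.nvvm.ceil.ftz.f(float %a)
    %r = fadd float %p, %q
    ret float %r
  }
  attributes #0 = { "nvptx-f32ftz" = "false" }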

Added: llvm/trunk/test/Transforms/InstCombine/OverlappingInsertvalues.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/OverlappingInsertvalues.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/OverlappingInsertvalues.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/OverlappingInsertvalues.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,36 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that we can find and remove redundant insertvalues
+; CHECK-LABEL: foo_simple
+; CHECK-NOT: i8* %x, 0
+define { i8*, i64, i32 } @foo_simple(i8* %x, i8* %y) nounwind {
+entry:
+  %0 = insertvalue { i8*, i64, i32 } undef, i8* %x, 0
+  %1 = insertvalue { i8*, i64, i32 } %0, i8* %y, 0
+  ret { i8*, i64, i32 } %1
+}
+; Check that we can find and remove redundant nodes in an insertvalue chain
+; CHECK-LABEL: foo_ovwrt_chain
+; CHECK-NOT: i64 %y, 1
+; CHECK-NOT: i32 555, 2
+define { i8*, i64, i32 } @foo_ovwrt_chain(i8* %x, i64 %y, i64 %z) nounwind {
+entry:
+  %0 = insertvalue { i8*, i64, i32 } undef, i8* %x, 0
+  %1 = insertvalue { i8*, i64, i32 } %0, i64 %y, 1
+  %2 = insertvalue { i8*, i64, i32 } %1, i32 555, 2
+  %3 = insertvalue { i8*, i64, i32 } %2, i64 %z, 1
+  %4 = insertvalue { i8*, i64, i32 } %3, i32 777, 2
+  ret { i8*, i64, i32 } %4
+}
+; Check that we propagate insertvalues only if they are used as the first
+; operand (i.e. as the initial value of the aggregate)
+; CHECK-LABEL: foo_use_as_second_operand
+; CHECK: i16 %x, 0
+; CHECK: %0, 1
+define { i8, {i16, i32} } @foo_use_as_second_operand(i16 %x) nounwind {
+entry:
+  %0 = insertvalue { i16, i32 } undef, i16 %x, 0
+  %1 = insertvalue { i8, {i16, i32} } undef, { i16, i32 } %0, 1
+  ret { i8, {i16, i32} } %1
+}
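
The chain case from @foo_ovwrt_chain as its expected final form (a sketch,
our function name): a struct index can only be observed through its last
insertvalue, so the earlier writes to indices 1 and 2 are dead.

  define { i8*, i64, i32 } @sketch_pruned(i8* %x, i64 %z) {
    ; Only the last write to each of indices 0, 1, and 2 survives.
    %0 = insertvalue { i8*, i64, i32 } undef, i8* %x, 0
    %1 = insertvalue { i8*, i64, i32 } %0, i64 %z, 1
    %2 = insertvalue { i8*, i64, i32 } %1, i32 777, 2
    ret { i8*, i64, i32 } %2
  }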

Added: llvm/trunk/test/Transforms/InstCombine/PR30597.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/PR30597.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/PR30597.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/PR30597.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,32 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: readonly uwtable
+define i1 @dot_ref_s(i32** noalias nocapture readonly dereferenceable(8)) {
+entry-block:
+  %loadedptr = load i32*, i32** %0, align 8, !nonnull !0
+  %ptrtoint = ptrtoint i32* %loadedptr to i64
+  %inttoptr = inttoptr i64 %ptrtoint to i32*
+  %switchtmp = icmp eq i32* %inttoptr, null
+  ret i1 %switchtmp
+
+; CHECK-LABEL: @dot_ref_s
+; CHECK-NEXT: entry-block:
+; CHECK-NEXT: ret i1 false
+}
+
+; Function Attrs: readonly uwtable
+define i64* @function(i64* noalias nocapture readonly dereferenceable(8)) {
+entry-block:
+  %loaded = load i64, i64* %0, align 8, !range !1
+  %inttoptr = inttoptr i64 %loaded to i64*
+  ret i64* %inttoptr
+; CHECK-LABEL: @function
+; CHECK: %{{.+}} = load i64*, i64** %{{.+}}, align 8, !nonnull
+}
+
+
+!0 = !{}
+!1 = !{i64 1, i64 140737488355327}
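
Why @dot_ref_s folds to false (our reading): !nonnull makes the loaded
pointer known non-null, the same-width ptrtoint/inttoptr pair is an
identity, and comparing a known non-null pointer against null is constant
false. A minimal standalone version (sketch; the name and metadata id are
ours):

  define i1 @sketch_nonnull_cmp(i32** align 8 dereferenceable(8) %pp) {
    %p = load i32*, i32** %pp, align 8, !nonnull !9
    ; Expected after -instcombine: ret i1 false
    %cmp = icmp eq i32* %p, null
    ret i1 %cmp
  }
  !9 = !{}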

Added: llvm/trunk/test/Transforms/InstCombine/PR37526.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/PR37526.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/PR37526.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/PR37526.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+define void @PR37526(i32* %pz, i32* %px, i32* %py) {
+; CHECK-LABEL: @PR37526(
+; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* [[PY:%.*]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = load i32, i32* [[PX:%.*]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[T2]], [[T3]]
+; CHECK-NEXT:    [[R1:%.*]] = select i1 [[CMP]], i32 [[T3]], i32 [[T2]]
+; CHECK-NEXT:    store i32 [[R1]], i32* [[PZ:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %t1 = bitcast i32* %pz to i64*
+  %t2 = load i32, i32* %py
+  %t3 = load i32, i32* %px
+  %cmp = icmp slt i32 %t2, %t3
+  %select = select i1 %cmp, i32* %px, i32* %py
+  %bc = bitcast i32* %select to i64*
+  %r = load i64, i64* %bc
+  store i64 %r, i64* %t1
+  ret void
+}
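
Setting the pointer bitcasts aside, the scalar core that the autogenerated
checks land on is a plain smax idiom (a sketch, ours):

  define i32 @sketch_smax(i32 %x, i32 %y) {
    %cmp = icmp slt i32 %y, %x
    %r = select i1 %cmp, i32 %x, i32 %y   ; smax(%x, %y)
    ret i32 %r
  }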

Added: llvm/trunk/test/Transforms/InstCombine/PowerPC/aligned-altivec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/PowerPC/aligned-altivec.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/PowerPC/aligned-altivec.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/PowerPC/aligned-altivec.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,131 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <4 x i32> @llvm.ppc.altivec.lvx(i8*) #1
+
+define <4 x i32> @test1(<4 x i32>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>, <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvx(i8* %hv)
+
+; CHECK-LABEL: @test1
+; CHECK: @llvm.ppc.altivec.lvx
+; CHECK: ret <4 x i32>
+
+  %v0 = load <4 x i32>, <4 x i32>* %h, align 8
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @test1a(<4 x i32>* align 16 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>, <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvx(i8* %hv)
+
+; CHECK-LABEL: @test1a
+; CHECK-NOT: @llvm.ppc.altivec.lvx
+; CHECK: ret <4 x i32>
+
+  %v0 = load <4 x i32>, <4 x i32>* %h, align 8
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+}
+
+declare void @llvm.ppc.altivec.stvx(<4 x i32>, i8*) #0
+
+define <4 x i32> @test2(<4 x i32>* %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>, <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvx(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>, <4 x i32>* %h, align 8
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2
+; CHECK: @llvm.ppc.altivec.stvx
+; CHECK: ret <4 x i32>
+}
+
+define <4 x i32> @test2a(<4 x i32>* align 16 %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>, <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvx(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>, <4 x i32>* %h, align 8
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2a
+; CHECK-NOT: @llvm.ppc.altivec.stvx
+; CHECK: ret <4 x i32>
+}
+
+declare <4 x i32> @llvm.ppc.altivec.lvxl(i8*) #1
+
+define <4 x i32> @test1l(<4 x i32>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>, <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvxl(i8* %hv)
+
+; CHECK-LABEL: @test1l
+; CHECK: @llvm.ppc.altivec.lvxl
+; CHECK: ret <4 x i32>
+
+  %v0 = load <4 x i32>, <4 x i32>* %h, align 8
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+}
+
+define <4 x i32> @test1la(<4 x i32>* align 16 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>, <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvxl(i8* %hv)
+
+; CHECK-LABEL: @test1la
+; CHECK-NOT: @llvm.ppc.altivec.lvxl
+; CHECK: ret <4 x i32>
+
+  %v0 = load <4 x i32>, <4 x i32>* %h, align 8
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+}
+
+declare void @llvm.ppc.altivec.stvxl(<4 x i32>, i8*) #0
+
+define <4 x i32> @test2l(<4 x i32>* %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>, <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvxl(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>, <4 x i32>* %h, align 8
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2l
+; CHECK: @llvm.ppc.altivec.stvxl
+; CHECK: ret <4 x i32>
+}
+
+define <4 x i32> @test2la(<4 x i32>* align 16 %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>, <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvxl(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>, <4 x i32>* %h, align 8
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2la
+; CHECK-NOT: @llvm.ppc.altivec.stvxl
+; CHECK: ret <4 x i32>
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
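
What separates @test1 from @test1a (our summary): lvx masks the low four
address bits, so it is only equivalent to a plain vector load once 16-byte
alignment is proven, here via the align 16 parameter attribute. A minimal
positive case (a sketch reusing the lvx declaration above):

  define <4 x i32> @sketch_lvx_fold(<4 x i32>* align 16 %h) {
    %hv = bitcast <4 x i32>* %h to i8*
    ; Expected after -instcombine: load <4 x i32>, <4 x i32>* %h, align 16
    %v = call <4 x i32> @llvm.ppc.altivec.lvx(i8* %hv)
    ret <4 x i32> %v
  }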

Added: llvm/trunk/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/PowerPC/aligned-qpx.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,165 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <4 x double> @llvm.ppc.qpx.qvlfs(i8*) #1
+
+define <4 x double> @test1(<4 x float>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1
+  %hv = bitcast <4 x float>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv)
+
+; CHECK-LABEL: @test1
+; CHECK: @llvm.ppc.qpx.qvlfs
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x float>, <4 x float>* %h, align 8
+  %v0e = fpext <4 x float> %v0 to <4 x double>
+  %a = fadd <4 x double> %v0e, %vl
+  ret <4 x double> %a
+}
+
+define <4 x double> @test1a(<4 x float>* align 16 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1
+  %hv = bitcast <4 x float>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfs(i8* %hv)
+
+; CHECK-LABEL: @test1a
+; CHECK-NOT: @llvm.ppc.qpx.qvlfs
+; CHECK-NOT: load <4 x double>
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x float>, <4 x float>* %h, align 8
+  %v0e = fpext <4 x float> %v0 to <4 x double>
+  %a = fadd <4 x double> %v0e, %vl
+  ret <4 x double> %a
+}
+
+declare void @llvm.ppc.qpx.qvstfs(<4 x double>, i8*) #0
+
+define <4 x float> @test2(<4 x float>* %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1
+  %hv = bitcast <4 x float>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x float>, <4 x float>* %h, align 8
+  ret <4 x float> %v0
+
+; CHECK-LABEL: @test2
+; CHECK: @llvm.ppc.qpx.qvstfs
+; CHECK: ret <4 x float>
+}
+
+define <4 x float> @test2a(<4 x float>* align 16 %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x float>, <4 x float>* %h, i64 1
+  %hv = bitcast <4 x float>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfs(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x float>, <4 x float>* %h, align 8
+  ret <4 x float> %v0
+
+; CHECK-LABEL: @test2a
+; CHECK: fptrunc <4 x double> %d to <4 x float>
+; CHECK-NOT: @llvm.ppc.qpx.qvstfs
+; CHECK-NOT: store <4 x double>
+; CHECK: ret <4 x float>
+}
+
+declare <4 x double> @llvm.ppc.qpx.qvlfd(i8*) #1
+
+define <4 x double> @test1l(<4 x double>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv)
+
+; CHECK-LABEL: @test1l
+; CHECK: @llvm.ppc.qpx.qvlfd
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x double>, <4 x double>* %h, align 8
+  %a = fadd <4 x double> %v0, %vl
+  ret <4 x double> %a
+}
+
+define <4 x double> @test1ln(<4 x double>* align 16 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv)
+
+; CHECK-LABEL: @test1ln
+; CHECK: @llvm.ppc.qpx.qvlfd
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x double>, <4 x double>* %h, align 8
+  %a = fadd <4 x double> %v0, %vl
+  ret <4 x double> %a
+}
+
+define <4 x double> @test1la(<4 x double>* align 32 %h) #0 {
+entry:
+  %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  %vl = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv)
+
+; CHECK-LABEL: @test1la
+; CHECK-NOT: @llvm.ppc.qpx.qvlfd
+; CHECK: ret <4 x double>
+
+  %v0 = load <4 x double>, <4 x double>* %h, align 8
+  %a = fadd <4 x double> %v0, %vl
+  ret <4 x double> %a
+}
+
+declare void @llvm.ppc.qpx.qvstfd(<4 x double>, i8*) #0
+
+define <4 x double> @test2l(<4 x double>* %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x double>, <4 x double>* %h, align 8
+  ret <4 x double> %v0
+
+; CHECK-LABEL: @test2l
+; CHECK: @llvm.ppc.qpx.qvstfd
+; CHECK: ret <4 x double>
+}
+
+define <4 x double> @test2ln(<4 x double>* align 16 %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x double>, <4 x double>* %h, align 8
+  ret <4 x double> %v0
+
+; CHECK-LABEL: @test2ln
+; CHECK: @llvm.ppc.qpx.qvstfd
+; CHECK: ret <4 x double>
+}
+
+define <4 x double> @test2la(<4 x double>* align 32 %h, <4 x double> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x double>, <4 x double>* %h, i64 1
+  %hv = bitcast <4 x double>* %h1 to i8*
+  call void @llvm.ppc.qpx.qvstfd(<4 x double> %d, i8* %hv)
+
+  %v0 = load <4 x double>, <4 x double>* %h, align 8
+  ret <4 x double> %v0
+
+; CHECK-LABEL: @test2la
+; CHECK-NOT: @llvm.ppc.qpx.qvstfd
+; CHECK: ret <4 x double>
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
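
The asymmetry against the Altivec file is the point here (our reading of
the checks): qvlfs reads a 16-byte <4 x float>, so align 16 is enough to
fold it, while qvlfd reads a 32-byte <4 x double> and only folds at align
32; that is why @test1ln (align 16) keeps the intrinsic and @test1la
(align 32) loses it. A minimal positive case (a sketch reusing the qvlfd
declaration above):

  define <4 x double> @sketch_qvlfd_fold(<4 x double>* align 32 %h) {
    %hv = bitcast <4 x double>* %h to i8*
    ; Expected after -instcombine: load <4 x double>, <4 x double>* %h, align 32
    %v = call <4 x double> @llvm.ppc.qpx.qvlfd(i8* %hv)
    ret <4 x double> %v
  }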

Added: llvm/trunk/test/Transforms/InstCombine/PowerPC/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/PowerPC/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/PowerPC/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/InstCombine/PowerPC/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3 @@
+if 'PowerPC' not in config.root.targets:
+    config.unsupported = True
+

Added: llvm/trunk/test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,44 @@
+; Verify that we can create unaligned loads and stores from VSX intrinsics.
+
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+ at vf = common global <4 x float> zeroinitializer, align 1
+ at res_vf = common global <4 x float> zeroinitializer, align 1
+ at vd = common global <2 x double> zeroinitializer, align 1
+ at res_vd = common global <2 x double> zeroinitializer, align 1
+
+define void @test1() {
+entry:
+  %t1 = alloca <4 x float>*, align 8
+  %t2 = alloca <2 x double>*, align 8
+  store <4 x float>* @vf, <4 x float>** %t1, align 8
+  %0 = load <4 x float>*, <4 x float>** %t1, align 8
+  %1 = bitcast <4 x float>* %0 to i8*
+  %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %1)
+  store <4 x float>* @res_vf, <4 x float>** %t1, align 8
+  %3 = load <4 x float>*, <4 x float>** %t1, align 8
+  %4 = bitcast <4 x float>* %3 to i8*
+  call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %2, i8* %4)
+  store <2 x double>* @vd, <2 x double>** %t2, align 8
+  %5 = load <2 x double>*, <2 x double>** %t2, align 8
+  %6 = bitcast <2 x double>* %5 to i8*
+  %7 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %6)
+  store <2 x double>* @res_vd, <2 x double>** %t2, align 8
+  %8 = load <2 x double>*, <2 x double>** %t2, align 8
+  %9 = bitcast <2 x double>* %8 to i8*
+  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %7, i8* %9)
+  ret void
+}
+
+; CHECK-LABEL: @test1
+; CHECK: %0 = load <4 x i32>, <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 1
+; CHECK: store <4 x i32> %0, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 1
+; CHECK: %1 = load <2 x double>, <2 x double>* @vd, align 1
+; CHECK: store <2 x double> %1, <2 x double>* @res_vd, align 1
+
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
+declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
+declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
+declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)
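
The essential rewrite, minus the alloca traffic in @test1 (a sketch with
our names, reusing the lxvd2x declaration above): the VSX intrinsics have
no alignment requirement, so they become ordinary IR memory operations
with align 1.

  define <2 x double> @sketch_lxvd2x(<2 x double>* %p) {
    %pv = bitcast <2 x double>* %p to i8*
    ; Expected after -instcombine: load <2 x double>, <2 x double>* %p, align 1
    %v = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %pv)
    ret <2 x double> %v
  }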

Added: llvm/trunk/test/Transforms/InstCombine/README.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/README.txt?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/README.txt (added)
+++ llvm/trunk/test/Transforms/InstCombine/README.txt Tue Apr 16 21:52:47 2019
@@ -0,0 +1,4 @@
+This directory contains test cases for the instcombine transformation.  The
+dated tests are actual bug tests, whereas the named tests are used to test
+for features that this pass should be capable of performing.
+

Added: llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; The tests check the folding of cmp(sub(a,b),0) into cmp(a,b), which is
+; only performed when the fsub carries ninf (inf - inf would otherwise
+; produce NaN and change the compare result).
+
+define i8 @sub_compare_foldingPD128_safe(<2 x double> %a, <2 x double> %b){
+; CHECK-LABEL: @sub_compare_foldingPD128_safe(
+; CHECK-NEXT:    [[SUB_SAFE:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[SUB_SAFE]], <2 x double> zeroinitializer, i32 5)
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %sub.safe = fsub <2 x double> %a, %b
+  %t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.safe, <2 x double> zeroinitializer, i32 5)
+  %t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %t2 = bitcast <8 x i1> %t1 to i8
+  ret i8 %t2
+}
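+
+; Note (a sketch of the reasoning, not an autogenerated check): the _safe
+; variant above lacks ninf, and the fold would be unsound for it. With
+; %a = %b = +infinity, %a - %b is NaN, and a predicate on (NaN, 0.0) need
+; not agree with the same predicate on (+inf, +inf); hence the fsub stays.
+; The ninf variants below rule infinities out, so
+;   cmp(fsub ninf %a, %b, 0.0)  ==>  cmp(%a, %b)
+; is a sound rewrite.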
+
+define i8 @sub_compare_foldingPD128(<2 x double> %a, <2 x double> %b){
+; CHECK-LABEL: @sub_compare_foldingPD128(
+; CHECK-NEXT:    [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5)
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %sub.i = fsub ninf <2 x double> %a, %b
+  %t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.i, <2 x double> zeroinitializer, i32 5)
+  %t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %t2 = bitcast <8 x i1> %t1 to i8
+  ret i8 %t2
+}
+
+define i8 @sub_compare_foldingPD128_undef_elt(<2 x double> %a, <2 x double> %b){
+; CHECK-LABEL: @sub_compare_foldingPD128_undef_elt(
+; CHECK-NEXT:    [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 5)
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %sub.i = fsub ninf <2 x double> %a, %b
+  %t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> %sub.i, <2 x double> <double 0.0, double undef>, i32 5)
+  %t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %t2 = bitcast <8 x i1> %t1 to i8
+  ret i8 %t2
+}
+
+define i8 @sub_compare_foldingPD256(<4 x double> %a, <4 x double> %b){
+; CHECK-LABEL: @sub_compare_foldingPD256(
+; CHECK-NEXT:    [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> [[B:%.*]], i32 5)
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %sub.i1 = fsub ninf <4 x double> %a, %b
+  %t0 = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
+  %t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = bitcast <8 x i1> %t1 to i8
+  ret i8 %t2
+}
+
+define i8 @sub_compare_foldingPD512(<8 x double> %a, <8 x double> %b){
+; CHECK-LABEL: @sub_compare_foldingPD512(
+; CHECK-NEXT:    [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 11, i32 4)
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %sub.i2 = fsub ninf <8 x double> %a, %b
+  %t0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %sub.i2, <8 x double> zeroinitializer, i32 11, i32 4)
+  %t1 = bitcast <8 x i1> %t0 to i8
+  ret i8 %t1
+}
+
+define i8 @sub_compare_foldingPS128(<4 x float> %a, <4 x float> %b){
+; CHECK-LABEL: @sub_compare_foldingPS128(
+; CHECK-NEXT:    [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 12)
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %sub.i3 = fsub ninf <4 x float> %a, %b
+  %t0 = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> %sub.i3, <4 x float> zeroinitializer, i32 12)
+  %t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = bitcast <8 x i1> %t1 to i8
+  ret i8 %t2
+}
+
+define i8 @sub_compare_foldingPS256(<8 x float> %a, <8 x float> %b){
+; CHECK-LABEL: @sub_compare_foldingPS256(
+; CHECK-NEXT:    [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i32 5)
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %sub.i4 = fsub ninf <8 x float> %a, %b
+  %t0 = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> %sub.i4, <8 x float> zeroinitializer, i32 5)
+  %t1 = bitcast <8 x i1> %t0 to i8
+  ret i8 %t1
+}
+
+define i16 @sub_compare_foldingPS512(<16 x float> %a, <16 x float> %b){
+; CHECK-LABEL: @sub_compare_foldingPS512(
+; CHECK-NEXT:    [[T0:%.*]] = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 11, i32 4)
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <16 x i1> [[T0]] to i16
+; CHECK-NEXT:    ret i16 [[T1]]
+;
+  %sub.i5 = fsub ninf <16 x float> %a, %b
+  %t0 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %sub.i5, <16 x float> zeroinitializer, i32 11, i32 4)
+  %t1 = bitcast <16 x i1> %t0 to i16
+  ret i16 %t1
+}
+
+define i8 @sub_compare_folding_swapPD128(<2 x double> %a, <2 x double> %b){
+; CHECK-LABEL: @sub_compare_folding_swapPD128(
+; CHECK-NEXT:    [[T0:%.*]] = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> [[B:%.*]], <2 x double> [[A:%.*]], i32 5)
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <2 x i1> [[T0]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %sub.i = fsub ninf <2 x double> %a, %b
+  %t0 = call <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %sub.i, i32 5)
+  %t1 = shufflevector <2 x i1> %t0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %t2 = bitcast <8 x i1> %t1 to i8
+  ret i8 %t2
+}
+
+define i8 @sub_compare_folding_swapPD256(<4 x double> %a, <4 x double> %b){
+; CHECK-LABEL: @sub_compare_folding_swapPD256(
+; CHECK-NEXT:    [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> [[B:%.*]], <4 x double> [[A:%.*]], i32 5)
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %sub.i = fsub ninf <4 x double> %a, %b
+  %t0 = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %sub.i, i32 5)
+  %t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = bitcast <8 x i1> %t1 to i8
+  ret i8 %t2
+}
+
+define i8 @sub_compare_folding_swapPD256_undef(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @sub_compare_folding_swapPD256_undef(
+; CHECK-NEXT:    [[TMP:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> undef, <4 x double> zeroinitializer, i32 5)
+; CHECK-NEXT:    [[T0:%.*]] = shufflevector <4 x i1> [[TMP]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %sub.i1 = fsub ninf <4 x double> undef, undef
+  %tmp = call <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double> %sub.i1, <4 x double> zeroinitializer, i32 5)
+  %t0 = shufflevector <4 x i1> %tmp, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t1 = bitcast <8 x i1> %t0 to i8
+  ret i8 %t1
+}
+
+define i8 @sub_compare_folding_swapPD512(<8 x double> %a, <8 x double> %b){
+; CHECK-LABEL: @sub_compare_folding_swapPD512(
+; CHECK-NEXT:    [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> [[B:%.*]], <8 x double> [[A:%.*]], i32 11, i32 4)
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %sub.i = fsub ninf <8 x double> %a, %b
+  %t0 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %sub.i, i32 11, i32 4)
+  %t1 = bitcast <8 x i1> %t0 to i8
+  ret i8 %t1
+}
+
+define i8 @sub_compare_folding_swapPS128(<4 x float> %a, <4 x float> %b){
+; CHECK-LABEL: @sub_compare_folding_swapPS128(
+; CHECK-NEXT:    [[T0:%.*]] = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> [[B:%.*]], <4 x float> [[A:%.*]], i32 12)
+; CHECK-NEXT:    [[T1:%.*]] = shufflevector <4 x i1> [[T0]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[T2:%.*]] = bitcast <8 x i1> [[T1]] to i8
+; CHECK-NEXT:    ret i8 [[T2]]
+;
+  %sub.i = fsub ninf <4 x float> %a, %b
+  %t0 = call <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %sub.i, i32 12)
+  %t1 = shufflevector <4 x i1> %t0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %t2 = bitcast <8 x i1> %t1 to i8
+  ret i8 %t2
+}
+
+define i8 @sub_compare_folding_swapPS256(<8 x float> %a, <8 x float> %b){
+; CHECK-LABEL: @sub_compare_folding_swapPS256(
+; CHECK-NEXT:    [[T0:%.*]] = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> [[B:%.*]], <8 x float> [[A:%.*]], i32 5)
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <8 x i1> [[T0]] to i8
+; CHECK-NEXT:    ret i8 [[T1]]
+;
+  %sub.i = fsub ninf <8 x float> %a, %b
+  %t0 = call <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %sub.i, i32 5)
+  %t1 = bitcast <8 x i1> %t0 to i8
+  ret i8 %t1
+}
+
+define i16 @sub_compare_folding_swapPS512(<16 x float> %a, <16 x float> %b){
+; CHECK-LABEL: @sub_compare_folding_swapPS512(
+; CHECK-NEXT:    [[T0:%.*]] = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> [[B:%.*]], <16 x float> [[A:%.*]], i32 11, i32 4)
+; CHECK-NEXT:    [[T1:%.*]] = bitcast <16 x i1> [[T0]] to i16
+; CHECK-NEXT:    ret i16 [[T1]]
+;
+  %sub.i = fsub ninf <16 x float> %a, %b
+  %t0 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %sub.i, i32 11, i32 4)
+  %t1 = bitcast <16 x i1> %t0 to i16
+  ret i16 %t1
+}
+
+declare <2 x i1> @llvm.x86.avx512.cmp.pd.128(<2 x double>, <2 x double>, i32)
+declare <4 x i1> @llvm.x86.avx512.cmp.pd.256(<4 x double>, <4 x double>, i32)
+declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+declare <4 x i1> @llvm.x86.avx512.cmp.ps.128(<4 x float>, <4 x float>, i32)
+declare <8 x i1> @llvm.x86.avx512.cmp.ps.256(<8 x float>, <8 x float>, i32)
+declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
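
The tests above pin down one fold: when the operand of an AVX512
compare-against-zero is an fsub carrying the ninf flag, instcombine rewrites
cmp(a - b, 0) as cmp(a, b), and when the subtraction is the second compare
operand it swaps them, so cmp(0, a - b) becomes cmp(b, a). As a hedged
illustration (not part of this commit; names are made up), the C source
pattern looks roughly like this, assuming the usual immintrin.h AVX512VL
intrinsics and a fast-math mode that lets the compiler attach ninf to the
subtraction:

  #include <immintrin.h>

  __mmask8 cmp_sub_vs_zero(__m128d a, __m128d b) {
    /* _CMP_NLT_US is predicate 5, matching the i32 5 immediates above.
       With ninf on the fsub, cmp(a - b, 0) can fold to cmp(a, b). */
    return _mm_cmp_pd_mask(_mm_sub_pd(a, b), _mm_setzero_pd(), _CMP_NLT_US);
  }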

Added: llvm/trunk/test/Transforms/InstCombine/X86/addcarry.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/addcarry.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/addcarry.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/addcarry.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
+declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64)
+
+define i32 @no_carryin_i32(i32 %x, i32 %y, i8* %p) {
+; CHECK-LABEL: @no_carryin_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i8
+; CHECK-NEXT:    store i8 [[TMP4]], i8* [[P:%.*]], align 1
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %s = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %x, i32 %y)
+  %ov = extractvalue { i8, i32 } %s, 0
+  store i8 %ov, i8* %p
+  %r = extractvalue { i8, i32 } %s, 1
+  ret i32 %r
+}
+
+define i64 @no_carryin_i64(i64 %x, i64 %y, i8* %p) {
+; CHECK-LABEL: @no_carryin_i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i8
+; CHECK-NEXT:    store i8 [[TMP4]], i8* [[P:%.*]], align 1
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %s = call { i8, i64 } @llvm.x86.addcarry.64(i8 0, i64 %x, i64 %y)
+  %ov = extractvalue { i8, i64 } %s, 0
+  store i8 %ov, i8* %p
+  %r = extractvalue { i8, i64 } %s, 1
+  ret i64 %r
+}
+
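
These addcarry tests check that llvm.x86.addcarry.* with a constant-zero
carry-in is canonicalized to the generic llvm.uadd.with.overflow intrinsic,
with the i8 carry-out rebuilt by zext of the overflow bit. A minimal C
sketch of where such IR typically originates (illustrative names; assuming
the standard _addcarry_u32 intrinsic):

  #include <immintrin.h>

  unsigned add_no_carryin(unsigned x, unsigned y, unsigned char *p) {
    unsigned sum;
    /* A literal 0 carry-in makes the x86-specific intrinsic equivalent
       to a plain unsigned add-with-overflow. */
    *p = _addcarry_u32(0, x, y, &sum);
    return sum;
  }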

Added: llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/blend_x86.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,296 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -S | FileCheck %s
+
+define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
+; CHECK-LABEL: @constant_blendvpd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[AB:%.*]], <2 x double> [[XY:%.*]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %ab, <2 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00>)
+  ret <2 x double> %1
+}
+
+define <2 x double> @constant_blendvpd_zero(<2 x double> %xy, <2 x double> %ab) {
+; CHECK-LABEL: @constant_blendvpd_zero(
+; CHECK-NEXT:    ret <2 x double> [[XY:%.*]]
+;
+  %1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %ab, <2 x double> zeroinitializer)
+  ret <2 x double> %1
+}
+
+define <2 x double> @constant_blendvpd_dup(<2 x double> %xy, <2 x double> %sel) {
+; CHECK-LABEL: @constant_blendvpd_dup(
+; CHECK-NEXT:    ret <2 x double> [[XY:%.*]]
+;
+  %1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %xy, <2 x double> %sel)
+  ret <2 x double> %1
+}
+
+define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
+; CHECK-LABEL: @constant_blendvps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[XYZW:%.*]], <4 x float> [[ABCD:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %abcd, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>)
+  ret <4 x float> %1
+}
+
+define <4 x float> @constant_blendvps_zero(<4 x float> %xyzw, <4 x float> %abcd) {
+; CHECK-LABEL: @constant_blendvps_zero(
+; CHECK-NEXT:    ret <4 x float> [[XYZW:%.*]]
+;
+  %1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %abcd, <4 x float> zeroinitializer)
+  ret <4 x float> %1
+}
+
+define <4 x float> @constant_blendvps_dup(<4 x float> %xyzw, <4 x float> %sel) {
+; CHECK-LABEL: @constant_blendvps_dup(
+; CHECK-NEXT:    ret <4 x float> [[XYZW:%.*]]
+;
+  %1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %xyzw, <4 x float> %sel)
+  ret <4 x float> %1
+}
+
+define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
+; CHECK-LABEL: @constant_pblendvb(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[XYZW:%.*]], <16 x i8> [[ABCD:%.*]], <16 x i32> <i32 0, i32 1, i32 18, i32 3, i32 20, i32 21, i32 22, i32 7, i32 8, i32 9, i32 26, i32 11, i32 28, i32 29, i32 30, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd, <16 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @constant_pblendvb_zero(<16 x i8> %xyzw, <16 x i8> %abcd) {
+; CHECK-LABEL: @constant_pblendvb_zero(
+; CHECK-NEXT:    ret <16 x i8> [[XYZW:%.*]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd, <16 x i8> zeroinitializer)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @constant_pblendvb_dup(<16 x i8> %xyzw, <16 x i8> %sel) {
+; CHECK-LABEL: @constant_pblendvb_dup(
+; CHECK-NEXT:    ret <16 x i8> [[XYZW:%.*]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %xyzw, <16 x i8> %sel)
+  ret <16 x i8> %1
+}
+
+define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
+; CHECK-LABEL: @constant_blendvpd_avx(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[AB:%.*]], <4 x double> [[XY:%.*]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %ab, <4 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00, double 0xFFFFFFFFE0000000, double 0.000000e+00>)
+  ret <4 x double> %1
+}
+
+define <4 x double> @constant_blendvpd_avx_zero(<4 x double> %xy, <4 x double> %ab) {
+; CHECK-LABEL: @constant_blendvpd_avx_zero(
+; CHECK-NEXT:    ret <4 x double> [[XY:%.*]]
+;
+  %1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %ab, <4 x double> zeroinitializer)
+  ret <4 x double> %1
+}
+
+define <4 x double> @constant_blendvpd_avx_dup(<4 x double> %xy, <4 x double> %sel) {
+; CHECK-LABEL: @constant_blendvpd_avx_dup(
+; CHECK-NEXT:    ret <4 x double> [[XY:%.*]]
+;
+  %1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %xy, <4 x double> %sel)
+  ret <4 x double> %1
+}
+
+define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
+; CHECK-LABEL: @constant_blendvps_avx(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[XYZW:%.*]], <8 x float> [[ABCD:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %abcd, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>)
+  ret <8 x float> %1
+}
+
+define <8 x float> @constant_blendvps_avx_zero(<8 x float> %xyzw, <8 x float> %abcd) {
+; CHECK-LABEL: @constant_blendvps_avx_zero(
+; CHECK-NEXT:    ret <8 x float> [[XYZW:%.*]]
+;
+  %1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %abcd, <8 x float> zeroinitializer)
+  ret <8 x float> %1
+}
+
+define <8 x float> @constant_blendvps_avx_dup(<8 x float> %xyzw, <8 x float> %sel) {
+; CHECK-LABEL: @constant_blendvps_avx_dup(
+; CHECK-NEXT:    ret <8 x float> [[XYZW:%.*]]
+;
+  %1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %xyzw, <8 x float> %sel)
+  ret <8 x float> %1
+}
+
+define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
+; CHECK-LABEL: @constant_pblendvb_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[XYZW:%.*]], <32 x i8> [[ABCD:%.*]], <32 x i32> <i32 0, i32 1, i32 34, i32 3, i32 36, i32 37, i32 38, i32 7, i32 8, i32 9, i32 42, i32 11, i32 44, i32 45, i32 46, i32 15, i32 16, i32 17, i32 50, i32 19, i32 52, i32 53, i32 54, i32 23, i32 24, i32 25, i32 58, i32 27, i32 60, i32 61, i32 62, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %xyzw, <32 x i8> %abcd,
+  <32 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
+  i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
+  i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
+  i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @constant_pblendvb_avx2_zero(<32 x i8> %xyzw, <32 x i8> %abcd) {
+; CHECK-LABEL: @constant_pblendvb_avx2_zero(
+; CHECK-NEXT:    ret <32 x i8> [[XYZW:%.*]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %xyzw, <32 x i8> %abcd, <32 x i8> zeroinitializer)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @constant_pblendvb_avx2_dup(<32 x i8> %xyzw, <32 x i8> %sel) {
+; CHECK-LABEL: @constant_pblendvb_avx2_dup(
+; CHECK-NEXT:    ret <32 x i8> [[XYZW:%.*]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %xyzw, <32 x i8> %xyzw, <32 x i8> %sel)
+  ret <32 x i8> %1
+}
+
+define <4 x float> @sel_v4f32(<4 x float> %x, <4 x float> %y, <4 x i1> %cond) {
+; CHECK-LABEL: @sel_v4f32(
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[COND:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[X:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %s = sext <4 x i1> %cond to <4 x i32>
+  %b = bitcast <4 x i32> %s to <4 x float>
+  %r = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %x, <4 x float> %y, <4 x float> %b)
+  ret <4 x float> %r
+}
+
+define <2 x double> @sel_v2f64(<2 x double> %x, <2 x double> %y, <2 x i1> %cond) {
+; CHECK-LABEL: @sel_v2f64(
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[COND:%.*]], <2 x double> [[Y:%.*]], <2 x double> [[X:%.*]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
+  %s = sext <2 x i1> %cond to <2 x i64>
+  %b = bitcast <2 x i64> %s to <2 x double>
+  %r = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %x, <2 x double> %y, <2 x double> %b)
+  ret <2 x double> %r
+}
+
+; Bitcast X, Y, and the select and remove the intrinsic.
+
+define <16 x i8> @sel_v4i32(<16 x i8> %x, <16 x i8> %y, <4 x i1> %cond) {
+; CHECK-LABEL: @sel_v4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[Y:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[TMP2]], <4 x i32> [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %s = sext <4 x i1> %cond to <4 x i32>
+  %b = bitcast <4 x i32> %s to <16 x i8>
+  %r = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b)
+  ret <16 x i8> %r
+}
+
+define <16 x i8> @sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i1> %cond) {
+; CHECK-LABEL: @sel_v16i8(
+; CHECK-NEXT:    [[R:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i8> [[Y:%.*]], <16 x i8> [[X:%.*]]
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %s = sext <16 x i1> %cond to <16 x i8>
+  %r = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %x, <16 x i8> %y, <16 x i8> %s)
+  ret <16 x i8> %r
+}
+
+; PR38814: https://bugs.llvm.org/show_bug.cgi?id=38814
+; Repeat the tests above using the minimal form that we expect when using C intrinsics in code.
+; This verifies that nothing is interfering with the blend transform. This also tests the
+; expected IR when one of the blend operands is a constant 0 vector. Potentially, this could
+; be transformed to bitwise logic in IR, but currently that transform is left to the backend.
+
+define <4 x float> @sel_v4f32_sse_reality(<4 x float>* %x, <4 x float> %y, <4 x float> %z) {
+; CHECK-LABEL: @sel_v4f32_sse_reality(
+; CHECK-NEXT:    [[LD:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <4 x float> [[Z:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = select <4 x i1> [[CMP]], <4 x float> zeroinitializer, <4 x float> [[LD]]
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %ld = load <4 x float>, <4 x float>* %x, align 16
+  %cmp = fcmp olt <4 x float> %z, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %cond = bitcast <4 x i32> %sext to <4 x float>
+  %r = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %ld, <4 x float> zeroinitializer, <4 x float> %cond)
+  ret <4 x float> %r
+}
+
+define <2 x double> @sel_v2f64_sse_reality(<2 x double>* nocapture readonly %x, <2 x double> %y, <2 x double> %z) {
+; CHECK-LABEL: @sel_v2f64_sse_reality(
+; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* [[X:%.*]], align 16
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <2 x double> [[Z:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[CMP]], <2 x double> zeroinitializer, <2 x double> [[LD]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
+  %ld = load <2 x double>, <2 x double>* %x, align 16
+  %cmp = fcmp olt <2 x double> %z, %y
+  %sext = sext <2 x i1> %cmp to <2 x i64>
+  %cond = bitcast <2 x i64> %sext to <2 x double>
+  %r = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %ld, <2 x double> zeroinitializer, <2 x double> %cond)
+  ret <2 x double> %r
+}
+
+; Bitcast the inputs and the result and remove the intrinsic.
+
+define <2 x i64> @sel_v4i32_sse_reality(<2 x i64>* nocapture readonly %x, <2 x i64> %y, <2 x i64> %z) {
+; CHECK-LABEL: @sel_v4i32_sse_reality(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64>* [[X:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[LD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
+; CHECK-NEXT:    [[YCAST:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[ZCAST:%.*]] = bitcast <2 x i64> [[Z:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <4 x i32> [[YCAST]], [[ZCAST]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[CMP]], <4 x i32> zeroinitializer, <4 x i32> [[LD1]]
+; CHECK-NEXT:    [[RCAST:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[RCAST]]
+;
+  %xcast = bitcast <2 x i64>* %x to <16 x i8>*
+  %ld = load <16 x i8>, <16 x i8>* %xcast, align 16
+  %ycast = bitcast <2 x i64> %y to <4 x i32>
+  %zcast = bitcast <2 x i64> %z to <4 x i32>
+  %cmp = icmp sgt <4 x i32> %ycast, %zcast
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %cond = bitcast <4 x i32> %sext to <16 x i8>
+  %r = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %ld, <16 x i8> zeroinitializer, <16 x i8> %cond)
+  %rcast = bitcast <16 x i8> %r to <2 x i64>
+  ret <2 x i64> %rcast
+}
+
+define <2 x i64> @sel_v16i8_sse_reality(<2 x i64>* nocapture readonly %x, <2 x i64> %y, <2 x i64> %z) {
+; CHECK-LABEL: @sel_v16i8_sse_reality(
+; CHECK-NEXT:    [[XCAST:%.*]] = bitcast <2 x i64>* [[X:%.*]] to <16 x i8>*
+; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[XCAST]], align 16
+; CHECK-NEXT:    [[YCAST:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[ZCAST:%.*]] = bitcast <2 x i64> [[Z:%.*]] to <16 x i8>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <16 x i8> [[YCAST]], [[ZCAST]]
+; CHECK-NEXT:    [[R:%.*]] = select <16 x i1> [[CMP]], <16 x i8> zeroinitializer, <16 x i8> [[LD]]
+; CHECK-NEXT:    [[RCAST:%.*]] = bitcast <16 x i8> [[R]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[RCAST]]
+;
+  %xcast = bitcast <2 x i64>* %x to <16 x i8>*
+  %ld = load <16 x i8>, <16 x i8>* %xcast, align 16
+  %ycast = bitcast <2 x i64> %y to <16 x i8>
+  %zcast = bitcast <2 x i64> %z to <16 x i8>
+  %cmp = icmp sgt <16 x i8> %ycast, %zcast
+  %sext = sext <16 x i1> %cmp to <16 x i8>
+  %r = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %ld, <16 x i8> zeroinitializer, <16 x i8> %sext)
+  %rcast = bitcast <16 x i8> %r to <2 x i64>
+  ret <2 x i64> %rcast
+}
+
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
+
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
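
The blend tests above cover three shapes: a constant selector becomes a
shufflevector, an all-zero selector (or duplicated operands) returns the
first input unchanged, and a sign-extended vector compare becomes a generic
select. For the PR38814 "reality" tests, a hedged C sketch of the intended
source pattern (SSE4.1 intrinsics; illustrative, not taken from the bug
report):

  #include <smmintrin.h>

  __m128 zero_if_less(const __m128 *x, __m128 y, __m128 z) {
    /* blendvps selects from its second operand where the mask lane's
       sign bit is set, so this zeroes the lanes where z < y. */
    return _mm_blendv_ps(*x, _mm_setzero_ps(), _mm_cmplt_ps(z, y));
  }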

Added: llvm/trunk/test/Transforms/InstCombine/X86/clmulqdq.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/clmulqdq.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/clmulqdq.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/clmulqdq.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,266 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8)
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8)
+declare <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64>, <8 x i64>, i8)
+
+define <2 x i64> @test_demanded_elts_pclmulqdq_0(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 0)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = insertelement <2 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <2 x i64> %a1, i64 1, i64 1
+  %3 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %2, i8 0)
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_demanded_elts_pclmulqdq_1(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> <i64 undef, i64 1>, <2 x i64> [[A1:%.*]], i8 1)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = insertelement <2 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <2 x i64> %a1, i64 1, i64 1
+  %3 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %2, i8 1)
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_demanded_elts_pclmulqdq_16(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_16(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> [[A0:%.*]], <2 x i64> <i64 undef, i64 1>, i8 16)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = insertelement <2 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <2 x i64> %a1, i64 1, i64 1
+  %3 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %2, i8 16)
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_demanded_elts_pclmulqdq_17(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_17(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> <i64 undef, i64 1>, <2 x i64> <i64 undef, i64 1>, i8 17)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = insertelement <2 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <2 x i64> %a1, i64 1, i64 1
+  %3 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %2, i8 17)
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @test_demanded_elts_pclmulqdq_undef_0() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_undef_0(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> <i64 undef, i64 1>, <2 x i64> <i64 undef, i64 1>, i8 0)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_demanded_elts_pclmulqdq_undef_1() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_undef_1(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> <i64 1, i64 undef>, <2 x i64> <i64 undef, i64 1>, i8 1)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_demanded_elts_pclmulqdq_undef_16() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_undef_16(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> <i64 undef, i64 1>, <2 x i64> <i64 1, i64 undef>, i8 16)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_demanded_elts_pclmulqdq_undef_17() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_undef_17(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> <i64 1, i64 undef>, <2 x i64> <i64 1, i64 undef>, i8 17)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @test_demanded_elts_pclmulqdq_256_0(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_256_0(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], i8 0)
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %1 = insertelement <4 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <4 x i64> %a1, i64 1, i64 1
+  %3 = insertelement <4 x i64> %1, i64 1, i64 3
+  %4 = insertelement <4 x i64> %2, i64 1, i64 3
+  %res = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %3, <4 x i64> %4, i8 0)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_demanded_elts_pclmulqdq_256_1(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_256_1(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> <i64 undef, i64 1, i64 undef, i64 1>, <4 x i64> [[A1:%.*]], i8 1)
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %1 = insertelement <4 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <4 x i64> %a1, i64 1, i64 1
+  %3 = insertelement <4 x i64> %1, i64 1, i64 3
+  %4 = insertelement <4 x i64> %2, i64 1, i64 3
+  %res = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %3, <4 x i64> %4, i8 1)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_demanded_elts_pclmulqdq_256_16(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_256_16(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> [[A0:%.*]], <4 x i64> <i64 undef, i64 1, i64 undef, i64 1>, i8 16)
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %1 = insertelement <4 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <4 x i64> %a1, i64 1, i64 1
+  %3 = insertelement <4 x i64> %1, i64 1, i64 3
+  %4 = insertelement <4 x i64> %2, i64 1, i64 3
+  %res = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %3, <4 x i64> %4, i8 16)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_demanded_elts_pclmulqdq_256_17(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_256_17(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> <i64 undef, i64 1, i64 undef, i64 1>, <4 x i64> <i64 undef, i64 1, i64 undef, i64 1>, i8 17)
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %1 = insertelement <4 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <4 x i64> %a1, i64 1, i64 1
+  %3 = insertelement <4 x i64> %1, i64 1, i64 3
+  %4 = insertelement <4 x i64> %2, i64 1, i64 3
+  %res = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %3, <4 x i64> %4, i8 17)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_demanded_elts_pclmulqdq_256_undef_0() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_256_undef_0(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> <i64 undef, i64 1, i64 undef, i64 1>, <4 x i64> <i64 undef, i64 1, i64 undef, i64 1>, i8 0)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @test_demanded_elts_pclmulqdq_256_undef_1() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_256_undef_1(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> <i64 1, i64 undef, i64 1, i64 undef>, <4 x i64> <i64 undef, i64 1, i64 undef, i64 1>, i8 1)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @test_demanded_elts_pclmulqdq_256_undef_16() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_256_undef_16(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> <i64 undef, i64 1, i64 undef, i64 1>, <4 x i64> <i64 1, i64 undef, i64 1, i64 undef>, i8 16)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @test_demanded_elts_pclmulqdq_256_undef_17() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_256_undef_17(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> <i64 1, i64 undef, i64 1, i64 undef>, <4 x i64> <i64 1, i64 undef, i64 1, i64 undef>, i8 17)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @test_demanded_elts_pclmulqdq_512_0(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_512_0(
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1:%.*]], i8 0)
+; CHECK-NEXT:    ret <8 x i64> [[RES]]
+;
+  %1 = insertelement <8 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <8 x i64> %a1, i64 1, i64 1
+  %3 = insertelement <8 x i64> %1, i64 1, i64 3
+  %4 = insertelement <8 x i64> %2, i64 1, i64 3
+  %5 = insertelement <8 x i64> %3, i64 1, i64 5
+  %6 = insertelement <8 x i64> %4, i64 1, i64 5
+  %7 = insertelement <8 x i64> %5, i64 1, i64 7
+  %8 = insertelement <8 x i64> %6, i64 1, i64 7
+  %res = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %7, <8 x i64> %8, i8 0)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_demanded_elts_pclmulqdq_512_1(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_512_1(
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> <i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1>, <8 x i64> [[A1:%.*]], i8 1)
+; CHECK-NEXT:    ret <8 x i64> [[RES]]
+;
+  %1 = insertelement <8 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <8 x i64> %a1, i64 1, i64 1
+  %3 = insertelement <8 x i64> %1, i64 1, i64 3
+  %4 = insertelement <8 x i64> %2, i64 1, i64 3
+  %5 = insertelement <8 x i64> %3, i64 1, i64 5
+  %6 = insertelement <8 x i64> %4, i64 1, i64 5
+  %7 = insertelement <8 x i64> %5, i64 1, i64 7
+  %8 = insertelement <8 x i64> %6, i64 1, i64 7
+  %res = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %7, <8 x i64> %8, i8 1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_demanded_elts_pclmulqdq_512_16(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_512_16(
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> [[A0:%.*]], <8 x i64> <i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1>, i8 16)
+; CHECK-NEXT:    ret <8 x i64> [[RES]]
+;
+  %1 = insertelement <8 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <8 x i64> %a1, i64 1, i64 1
+  %3 = insertelement <8 x i64> %1, i64 1, i64 3
+  %4 = insertelement <8 x i64> %2, i64 1, i64 3
+  %5 = insertelement <8 x i64> %3, i64 1, i64 5
+  %6 = insertelement <8 x i64> %4, i64 1, i64 5
+  %7 = insertelement <8 x i64> %5, i64 1, i64 7
+  %8 = insertelement <8 x i64> %6, i64 1, i64 7
+  %res = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %7, <8 x i64> %8, i8 16)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_demanded_elts_pclmulqdq_512_17(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_512_17(
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> <i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1>, <8 x i64> <i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1>, i8 17)
+; CHECK-NEXT:    ret <8 x i64> [[RES]]
+;
+  %1 = insertelement <8 x i64> %a0, i64 1, i64 1
+  %2 = insertelement <8 x i64> %a1, i64 1, i64 1
+  %3 = insertelement <8 x i64> %1, i64 1, i64 3
+  %4 = insertelement <8 x i64> %2, i64 1, i64 3
+  %5 = insertelement <8 x i64> %3, i64 1, i64 5
+  %6 = insertelement <8 x i64> %4, i64 1, i64 5
+  %7 = insertelement <8 x i64> %5, i64 1, i64 7
+  %8 = insertelement <8 x i64> %6, i64 1, i64 7
+  %res = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %7, <8 x i64> %8, i8 17)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_demanded_elts_pclmulqdq_512_undef_0() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_512_undef_0(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> <i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1>, <8 x i64> <i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1>, i8 0)
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @test_demanded_elts_pclmulqdq_512_undef_1() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_512_undef_1(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> <i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef>, <8 x i64> <i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1>, i8 1)
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @test_demanded_elts_pclmulqdq_512_undef_16() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_512_undef_16(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> <i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1>, <8 x i64> <i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef>, i8 16)
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @test_demanded_elts_pclmulqdq_512_undef_17() {
+; CHECK-LABEL: @test_demanded_elts_pclmulqdq_512_undef_17(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> <i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef>, <8 x i64> <i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef, i64 1, i64 undef>, i8 17)
+  ret <8 x i64> %1
+}
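
PCLMULQDQ reads only one 64-bit half of each source: bit 0 of the immediate
selects the qword of the first operand and bit 4 selects the qword of the
second, which is why the inserts into the unread lanes above are removed as
dead. A short C sketch (wmmintrin.h; illustrative):

  #include <wmmintrin.h>

  __m128i clmul_low_halves(__m128i a, __m128i b) {
    /* imm8 = 0x00: carry-less multiply of the two low qwords; writes to
       the high qwords of a and b are dead and get dropped. */
    return _mm_clmulepi64_si128(a, b, 0x00);
  }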

Added: llvm/trunk/test/Transforms/InstCombine/X86/lit.local.cfg
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/lit.local.cfg?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/lit.local.cfg (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/lit.local.cfg Tue Apr 16 21:52:47 2019
@@ -0,0 +1,2 @@
+if 'X86' not in config.root.targets:
+    config.unsupported = True

Added: llvm/trunk/test/Transforms/InstCombine/X86/pr2645-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/pr2645-1.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/pr2645-1.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/pr2645-1.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,39 @@
+; RUN: opt < %s -instcombine -S | grep shufflevector
+; PR2645
+
+; instcombine shouldn't delete the shufflevector.
+
+define internal void @""(i8*, i32, i8*) {
+; <label>:3
+        br label %4
+
+; <label>:4             ; preds = %6, %3
+        %.0 = phi i32 [ 0, %3 ], [ %19, %6 ]            ; <i32> [#uses=4]
+        %5 = icmp slt i32 %.0, %1               ; <i1> [#uses=1]
+        br i1 %5, label %6, label %20
+
+; <label>:6             ; preds = %4
+        %7 = getelementptr i8, i8* %2, i32 %.0              ; <i8*> [#uses=1]
+        %8 = bitcast i8* %7 to <4 x i16>*               ; <<4 x i16>*> [#uses=1]
+        %9 = load <4 x i16>, <4 x i16>* %8, align 1                ; <<4 x i16>> [#uses=1]
+        %10 = bitcast <4 x i16> %9 to <1 x i64>         ; <<1 x i64>> [#uses=1]
+        %11 = call <2 x i64> @foo(<1 x i64> %10)
+; <<2 x i64>> [#uses=1]
+        %12 = bitcast <2 x i64> %11 to <4 x i32>                ; <<4 x i32>> [#uses=1]
+        %13 = bitcast <4 x i32> %12 to <8 x i16>                ; <<8 x i16>> [#uses=2]
+        %14 = shufflevector <8 x i16> %13, <8 x i16> %13, <8 x i32> < i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3 >          ; <<8 x i16>> [#uses=1]
+        %15 = bitcast <8 x i16> %14 to <4 x i32>                ; <<4 x i32>> [#uses=1]
+        %16 = sitofp <4 x i32> %15 to <4 x float>               ; <<4 x float>> [#uses=1]
+        %17 = getelementptr i8, i8* %0, i32 %.0             ; <i8*> [#uses=1]
+        %18 = bitcast i8* %17 to <4 x float>*           ; <<4 x float>*> [#uses=1]
+        store <4 x float> %16, <4 x float>* %18, align 1
+        %19 = add i32 %.0, 1            ; <i32> [#uses=1]
+        br label %4
+
+; <label>:20            ; preds = %4
+        call void @llvm.x86.mmx.emms( )
+        ret void
+}
+
+declare <2 x i64> @foo(<1 x i64>)
+declare void @llvm.x86.mmx.emms( )

Added: llvm/trunk/test/Transforms/InstCombine/X86/shufflemask-undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/shufflemask-undef.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/shufflemask-undef.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/shufflemask-undef.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,110 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK-NOT: shufflevector{{.*}}i32 8
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin9"
+	%struct.ActiveTextureTargets = type { i64, i64, i64, i64, i64, i64 }
+	%struct.AlphaTest = type { float, i16, i8, i8 }
+	%struct.ArrayRange = type { i8, i8, i8, i8 }
+	%struct.BlendMode = type { i16, i16, i16, i16, %struct.IColor4, i16, i16, i8, i8, i8, i8 }
+	%struct.ClearColor = type { double, %struct.IColor4, %struct.IColor4, float, i32 }
+	%struct.ClipPlane = type { i32, [6 x %struct.IColor4] }
+	%struct.ColorBuffer = type { i16, i8, i8, [8 x i16], [0 x i32] }
+	%struct.ColorMatrix = type { [16 x float]*, %struct.ImagingColorScale }
+	%struct.Convolution = type { %struct.IColor4, %struct.ImagingColorScale, i16, i16, [0 x i32], float*, i32, i32 }
+	%struct.DepthTest = type { i16, i16, i8, i8, i8, i8, double, double }
+	%struct.FixedFunction = type { %struct.PPStreamToken* }
+	%struct.FogMode = type { %struct.IColor4, float, float, float, float, float, i16, i16, i16, i8, i8 }
+	%struct.HintMode = type { i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 }
+	%struct.Histogram = type { %struct.ProgramLimits*, i32, i16, i8, i8 }
+	%struct.ImagingColorScale = type { %struct.TCoord2, %struct.TCoord2, %struct.TCoord2, %struct.TCoord2 }
+	%struct.ImagingSubset = type { %struct.Convolution, %struct.Convolution, %struct.Convolution, %struct.ColorMatrix, %struct.Minmax, %struct.Histogram, %struct.ImagingColorScale, %struct.ImagingColorScale, %struct.ImagingColorScale, %struct.ImagingColorScale, i32, [0 x i32] }
+	%struct.Light = type { %struct.IColor4, %struct.IColor4, %struct.IColor4, %struct.IColor4, %struct.PointLineLimits, float, float, float, float, float, %struct.PointLineLimits, float, %struct.PointLineLimits, float, %struct.PointLineLimits, float, float, float, float, float }
+	%struct.LightModel = type { %struct.IColor4, [8 x %struct.Light], [2 x %struct.Material], i32, i16, i16, i16, i8, i8, i8, i8, i8, i8 }
+	%struct.LightProduct = type { %struct.IColor4, %struct.IColor4, %struct.IColor4 }
+	%struct.LineMode = type { float, i32, i16, i16, i8, i8, i8, i8 }
+	%struct.LogicOp = type { i16, i8, i8 }
+	%struct.MaskMode = type { i32, [3 x i32], i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.Material = type { %struct.IColor4, %struct.IColor4, %struct.IColor4, %struct.IColor4, float, float, float, float, [8 x %struct.LightProduct], %struct.IColor4, [8 x i32] }
+	%struct.Minmax = type { %struct.MinmaxTable*, i16, i8, i8, [0 x i32] }
+	%struct.MinmaxTable = type { %struct.IColor4, %struct.IColor4 }
+	%struct.Mipmaplevel = type { [4 x i32], [4 x i32], [4 x float], [4 x i32], i32, i32, float*, i8*, i16, i16, i16, i16, [2 x float] }
+	%struct.Multisample = type { float, i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.PipelineProgramState = type { i8, i8, i8, i8, [0 x i32], %struct.IColor4* }
+	%struct.PixelMap = type { i32*, float*, float*, float*, float*, float*, float*, float*, float*, i32*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+	%struct.PixelMode = type { float, float, %struct.PixelStore, %struct.PixelTransfer, %struct.PixelMap, %struct.ImagingSubset, i32, i32 }
+	%struct.PixelPack = type { i32, i32, i32, i32, i32, i32, i32, i32, i8, i8, i8, i8 }
+	%struct.PixelStore = type { %struct.PixelPack, %struct.PixelPack }
+	%struct.PixelTransfer = type { float, float, float, float, float, float, float, float, float, float, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float }
+	%struct.PluginBufferData = type { i32 }
+	%struct.PointLineLimits = type { float, float, float }
+	%struct.PointMode = type { float, float, float, float, %struct.PointLineLimits, float, i8, i8, i8, i8, i16, i16, i32, i16, i16 }
+	%struct.PolygonMode = type { [128 x i8], float, float, i16, i16, i16, i16, i8, i8, i8, i8, i8, i8, i8, i8 }
+	%struct.ProgramLimits = type { i32, i32, i32, i32 }
+	%struct.RegisterCombiners = type { i8, i8, i8, i8, i32, [2 x %struct.IColor4], [8 x %struct.RegisterCombinersPerStageState], %struct.RegisterCombinersFinalStageState }
+	%struct.RegisterCombinersFinalStageState = type { i8, i8, i8, i8, [7 x %struct.RegisterCombinersPerVariableState] }
+	%struct.RegisterCombinersPerPortionState = type { [4 x %struct.RegisterCombinersPerVariableState], i8, i8, i8, i8, i16, i16, i16, i16, i16, i16 }
+	%struct.RegisterCombinersPerStageState = type { [2 x %struct.RegisterCombinersPerPortionState], [2 x %struct.IColor4] }
+	%struct.RegisterCombinersPerVariableState = type { i16, i16, i16, i16 }
+	%struct.SWRSurfaceRec = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8*, i8*, i8*, [4 x i8*], i32 }
+	%struct.ScissorTest = type { %struct.ProgramLimits, i8, i8, i8, i8 }
+	%struct.State = type <{ i16, i16, i16, i16, i32, i32, [256 x %struct.IColor4], [128 x %struct.IColor4], %struct.Viewport, %struct.Transform, %struct.LightModel, %struct.ActiveTextureTargets, %struct.AlphaTest, %struct.BlendMode, %struct.ClearColor, %struct.ColorBuffer, %struct.DepthTest, %struct.ArrayRange, %struct.FogMode, %struct.HintMode, %struct.LineMode, %struct.LogicOp, %struct.MaskMode, %struct.PixelMode, %struct.PointMode, %struct.PolygonMode, %struct.ScissorTest, i32, %struct.StencilTest, [8 x %struct.TextureMode], [16 x %struct.TextureImageMode], %struct.ArrayRange, [8 x %struct.TextureCoordGen], %struct.ClipPlane, %struct.Multisample, %struct.RegisterCombiners, %struct.ArrayRange, %struct.ArrayRange, [3 x %struct.PipelineProgramState], %struct.ArrayRange, %struct.TransformFeedback, i32*, %struct.FixedFunction, [3 x i32], [3 x i32] }>
+	%struct.StencilTest = type { [3 x { i32, i32, i16, i16, i16, i16 }], i32, [4 x i8] }
+	%struct.TextureCoordGen = type { { i16, i16, %struct.IColor4, %struct.IColor4 }, { i16, i16, %struct.IColor4, %struct.IColor4 }, { i16, i16, %struct.IColor4, %struct.IColor4 }, { i16, i16, %struct.IColor4, %struct.IColor4 }, i8, i8, i8, i8 }
+	%struct.TextureGeomState = type { i16, i16, i16, i16, i16, i8, i8, i8, i8, i16, i16, i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, [6 x i16], [6 x i16] }
+	%struct.TextureImageMode = type { float }
+	%struct.TextureLevel = type { i32, i32, i16, i16, i16, i8, i8, i16, i16, i16, i16, i8* }
+	%struct.TextureMode = type { %struct.IColor4, i32, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, float, float, i16, i16, i16, i16, i16, i16, [4 x i16], i8, i8, i8, i8, [3 x float], [4 x float], float, float }
+	%struct.TextureParamState = type { i16, i16, i16, i16, i16, i16, %struct.IColor4, float, float, float, float, i16, i16, i16, i16, float, i16, i8, i8, i32, i8* }
+	%struct.TextureRec = type { [4 x float], %struct.TextureState*, %struct.Mipmaplevel*, %struct.Mipmaplevel*, float, float, float, float, i8, i8, i8, i8, i16, i16, i16, i16, i32, float, [2 x %struct.PPStreamToken] }
+	%struct.TextureState = type { i16, i8, i8, i16, i16, float, i32, %struct.SWRSurfaceRec*, %struct.TextureParamState, %struct.TextureGeomState, [0 x i32], i8*, i32, %struct.TextureLevel, [1 x [15 x %struct.TextureLevel]] }
+	%struct.Transform = type <{ [24 x [16 x float]], [24 x [16 x float]], [16 x float], float, float, float, float, float, i8, i8, i8, i8, i32, i32, i32, i16, i16, i8, i8, i8, i8, i32 }>
+	%struct.TransformFeedback = type { i8, i8, i8, i8, [0 x i32], [16 x i32], [16 x i32] }
+	%struct.Viewport = type { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, double, double, i32, i32, i32, i32, float, float, float, float }
+	%struct.IColor4 = type { float, float, float, float }
+	%struct.TCoord2 = type { float, float }
+	%struct.VMGPStack = type { [6 x <4 x float>*], <4 x float>*, i32, i32, <4 x float>*, <4 x float>**, i32, i32, i32, i32, i32, i32 }
+	%struct.VMTextures = type { [16 x %struct.TextureRec*] }
+	%struct.PPStreamToken = type { { i16, i16, i32 } }
+	%struct._VMConstants = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, float, float, float, float, float, float, float, float, float, float, float, float, [256 x float], [528 x i8], { void (i8*, i8*, i32, i8*)*, float (float)*, float (float)*, float (float)*, i32 (float)* } }
+
+define i32 @foo(%struct.State* %dst, <4 x float>* %prgrm, <4 x float>** %buffs, %struct._VMConstants* %cnstn, %struct.PPStreamToken* %pstrm, %struct.PluginBufferData* %gpctx, %struct.VMTextures* %txtrs, %struct.VMGPStack* %gpstk) nounwind {
+bb266.i:
+	getelementptr <4 x float>, <4 x float>* null, i32 11		; <<4 x float>*>:0 [#uses=1]
+	load <4 x float>, <4 x float>* %0, align 16		; <<4 x float>>:1 [#uses=1]
+	shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 1, i32 1 >		; <<4 x float>>:2 [#uses=1]
+	shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>>:3 [#uses=1]
+	shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>>:4 [#uses=1]
+	shufflevector <4 x float> %4, <4 x float> %3, <4 x i32> < i32 6, i32 7, i32 2, i32 3 >		; <<4 x float>>:5 [#uses=1]
+	fmul <4 x float> %5, zeroinitializer		; <<4 x float>>:6 [#uses=2]
+	fmul <4 x float> %6, %6		; <<4 x float>>:7 [#uses=1]
+	fadd <4 x float> zeroinitializer, %7		; <<4 x float>>:8 [#uses=1]
+	call <4 x float> @llvm.x86.sse.max.ps( <4 x float> zeroinitializer, <4 x float> %8 ) nounwind readnone		; <<4 x float>>:9 [#uses=1]
+	%phitmp40 = bitcast <4 x float> %9 to <4 x i32>		; <<4 x i32>> [#uses=1]
+	%tmp4109.i = and <4 x i32> %phitmp40, < i32 8388607, i32 8388607, i32 8388607, i32 8388607 >		; <<4 x i32>> [#uses=1]
+	%tmp4116.i = or <4 x i32> %tmp4109.i, < i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216 >		; <<4 x i32>> [#uses=1]
+	%tmp4117.i = bitcast <4 x i32> %tmp4116.i to <4 x float>		; <<4 x float>> [#uses=1]
+	fadd <4 x float> %tmp4117.i, zeroinitializer		; <<4 x float>>:10 [#uses=1]
+	fmul <4 x float> %10, < float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01 >		; <<4 x float>>:11 [#uses=1]
+	call <4 x float> @llvm.x86.sse.max.ps( <4 x float> %11, <4 x float> zeroinitializer ) nounwind readnone		; <<4 x float>>:12 [#uses=1]
+	call <4 x float> @llvm.x86.sse.min.ps( <4 x float> %12, <4 x float> zeroinitializer ) nounwind readnone		; <<4 x float>>:13 [#uses=1]
+	%tmp4170.i = call <4 x float> @llvm.x86.sse.cmp.ps( <4 x float> %13, <4 x float> zeroinitializer, i8 2 ) nounwind		; <<4 x float>> [#uses=1]
+	bitcast <4 x float> %tmp4170.i to <16 x i8>		; <<16 x i8>>:14 [#uses=1]
+	call i32 @llvm.x86.sse2.pmovmskb.128( <16 x i8> %14 ) nounwind readnone		; <i32>:15 [#uses=1]
+	icmp eq i32 %15, 0		; <i1>:16 [#uses=1]
+	br i1 %16, label %bb5574.i, label %bb4521.i
+
+bb4521.i:		; preds = %bb266.i
+	unreachable
+
+bb5574.i:		; preds = %bb266.i
+	unreachable
+}
+
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
+
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-avx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-avx.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-avx.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-avx.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32)
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32)
+
+define <8 x float> @test_round_ps_floor(<8 x float> %a) {
+; CHECK-LABEL: @test_round_ps_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A:%.*]])
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a, i32 1)
+  ret <8 x float> %1
+}
+
+define <8 x float> @test_round_ps_ceil(<8 x float> %a) {
+; CHECK-LABEL: @test_round_ps_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A:%.*]])
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a, i32 2)
+  ret <8 x float> %1
+}
+
+define <4 x double> @test_round_pd_floor(<4 x double> %a) {
+; CHECK-LABEL: @test_round_pd_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a, i32 1)
+  ret <4 x double> %1
+}
+
+define <4 x double> @test_round_pd_ceil(<4 x double> %a) {
+; CHECK-LABEL: @test_round_pd_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a, i32 2)
+  ret <4 x double> %1
+}
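
These check that the AVX round intrinsics with the round-down immediate
(i32 1, i.e. _MM_FROUND_FLOOR) and the round-up immediate (i32 2,
_MM_FROUND_CEIL) become the generic llvm.floor/llvm.ceil intrinsics. A
minimal C sketch (illustrative):

  #include <immintrin.h>

  __m256 round_down(__m256 a) {
    /* Same as _mm256_floor_ps(a); with this immediate the x86 round
       intrinsic is mapped to llvm.floor.v8f32. */
    return _mm256_round_ps(a, _MM_FROUND_FLOOR);
  }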

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-avx2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-avx2.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-avx2.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-avx2.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Verify that instcombine is able to fold identity shuffles.
+
+define <8 x i32> @identity_test_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: @identity_test_vpermd(
+; CHECK-NEXT:    ret <8 x i32> %a0
+;
+  %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  ret <8 x i32> %a
+}
+
+define <8 x float> @identity_test_vpermps(<8 x float> %a0) {
+; CHECK-LABEL: @identity_test_vpermps(
+; CHECK-NEXT:    ret <8 x float> %a0
+;
+  %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  ret <8 x float> %a
+}
+
+; Instcombine should be able to fold the following shuffle to a builtin shufflevector
+; with a shuffle mask of all zeroes.
+
+define <8 x i32> @zero_test_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: @zero_test_vpermd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer)
+  ret <8 x i32> %a
+}
+
+define <8 x float> @zero_test_vpermps(<8 x float> %a0) {
+; CHECK-LABEL: @zero_test_vpermps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer)
+  ret <8 x float> %a
+}
+
+; Verify that instcombine is able to fold constant shuffles.
+
+define <8 x i32> @shuffle_test_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: @shuffle_test_vpermd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x i32> %a
+}
+
+define <8 x float> @shuffle_test_vpermps(<8 x float> %a0) {
+; CHECK-LABEL: @shuffle_test_vpermps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %a
+}
+
+; Verify that instcombine is able to fold constant shuffles with undef mask elements.
+
+define <8 x i32> @undef_test_vpermd(<8 x i32> %a0) {
+; CHECK-LABEL: @undef_test_vpermd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %a = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x i32> %a
+}
+
+define <8 x float> @undef_test_vpermps(<8 x float> %a0) {
+; CHECK-LABEL: @undef_test_vpermps(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %a = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %a
+}
+
+; Verify demanded-elements simplification: mask computation whose result
+; cannot affect any used lane should be removed.
+
+define <8 x i32> @elts_test_vpermd(<8 x i32> %a0, i32 %a1) {
+; CHECK-LABEL: @elts_test_vpermd(
+; CHECK-NEXT:    ret <8 x i32> %a0
+;
+  %1 = insertelement <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i32 %a1, i32 0
+  %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %1)
+  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %3
+}
+
+define <8 x float> @elts_test_vpermps(<8 x float> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @elts_test_vpermps(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = insertelement <8 x i32> %a1, i32 0, i32 7
+  %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %1)
+  %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %3
+}
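+
+; In the vpermd case the variable mask element lands in a lane the trailing
+; shuffle never reads, so the whole mask computation folds away; in the
+; vpermps case only the insert at index 7 is dead, so the call must remain.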
+
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-avx512.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-avx512.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-avx512.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,3532 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
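+
+; Operand order for the scalar mask.*.round intrinsics exercised below:
+; (%a, %b, %passthru, %mask, %rounding). A rounding operand of 4 is
+; CUR_DIRECTION, i.e. no explicit rounding is requested, which is what lets
+; instcombine rewrite the call as ordinary IR arithmetic on lane 0; a value of
+; 8 requests explicit rounding semantics, so those calls are left intact.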
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_add_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_add_ss_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 8)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_add_ss_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
+  ret <4 x float> %4
+}
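+
+; The masked variant still folds under rounding mode 4: bit 0 of %mask selects
+; between the new sum and lane 0 of the passthrough %c, per the CHECKs above.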
+
+define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_add_ss_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 8)
+  ret <4 x float> %4
+}
+
+define float @test_add_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_add_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
+  %10 = extractelement <4 x float> %9, i32 1
+  ret float %10
+}
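+
+; Lanes 1-3 of a scalar *.ss intrinsic pass through the first operand, so
+; extracting lane 1 constant-folds to the 1.0 inserted above even though the
+; rounding operand (8) blocks the fold of the arithmetic itself.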
+
+declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_add_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_add_sd_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 8)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_add_sd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_add_sd_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 8)
+  ret <2 x double> %2
+}
+
+define double @test_add_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_add_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_sub_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_sub_ss_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 8)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_sub_ss_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_sub_ss_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 8)
+  ret <4 x float> %4
+}
+
+define float @test_sub_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_sub_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
+  %10 = extractelement <4 x float> %9, i32 1
+  ret float %10
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_sub_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_sub_sd_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 8)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_sub_sd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_sub_sd_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 8)
+  ret <2 x double> %2
+}
+
+define double @test_sub_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_sub_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_mul_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_mul_ss_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 8)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mul_ss_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mul_ss_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 8)
+  ret <4 x float> %4
+}
+
+define float @test_mul_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_mul_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
+  %10 = extractelement <4 x float> %9, i32 1
+  ret float %10
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_mul_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_mul_sd_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 8)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mul_sd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mul_sd_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 8)
+  ret <2 x double> %2
+}
+
+define double @test_mul_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_mul_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_div_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_div_ss_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 8)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 8)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_div_ss_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP5]], float [[TMP3]], float [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_div_ss_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 8)
+  ret <4 x float> %4
+}
+
+define float @test_div_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_div_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
+  %10 = extractelement <4 x float> %9, i32 1
+  ret float %10
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_div_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_div_sd_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 8)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 8)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_div_sd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP5]], double [[TMP3]], double [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_div_sd_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 8)
+  ret <2 x double> %2
+}
+
+define double @test_div_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_div_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
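+
+; Unlike add/sub/mul/div above, max and min are not rewritten to generic IR
+; even at rounding mode 4, presumably because the x86 NaN and signed-zero
+; semantics differ from llvm.maxnum/llvm.minnum; only the dead upper-lane
+; inserts are removed.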
+
+define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_max_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_max_ss_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
+  ret <4 x float> %4
+}
+
+define float @test_max_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_max_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
+  %10 = extractelement <4 x float> %9, i32 1
+  ret float %10
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_max_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_max_sd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
+  ret <2 x double> %2
+}
+
+define double @test_max_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_max_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_min_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 4)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_min_ss_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
+  ret <4 x float> %4
+}
+
+define float @test_min_ss_1(float %a, float %b) {
+; CHECK-LABEL: @test_min_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
+  %10 = extractelement <4 x float> %9, i32 1
+  ret float %10
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_min_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 4)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_min_sd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
+  ret <2 x double> %2
+}
+
+define double @test_min_sd_1(double %a, double %b) {
+; CHECK-LABEL: @test_min_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
+  %6 = extractelement <2 x double> %5, i32 1
+  ret double %6
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
+
+define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) {
+; CHECK-LABEL: @test_cmp_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %3, <4 x float> %6, i32 3, i8 %mask, i32 4)
+  ret i8 %7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
+
+define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) {
+; CHECK-LABEL: @test_cmp_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %3 = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %1, <2 x double> %2, i32 3, i8 %mask, i32 4)
+  ret i8 %3
+}
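+
+; cmp.ss/cmp.sd read only lane 0 of each source, so demanded-elements analysis
+; strips the upper-lane inserts while the compare intrinsic itself survives.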
+
+define i64 @test(float %f, double %d) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> undef, float [[F:%.*]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[V03]], i32 4)
+; CHECK-NEXT:    [[V13:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> [[V13]], i32 4)
+; CHECK-NEXT:    [[V23:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[V23]], i32 4)
+; CHECK-NEXT:    [[V33:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> [[V33]], i32 4)
+; CHECK-NEXT:    [[V41:%.*]] = insertelement <2 x double> undef, double [[D:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[V41]], i32 4)
+; CHECK-NEXT:    [[V51:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> [[V51]], i32 4)
+; CHECK-NEXT:    [[V61:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[V61]], i32 4)
+; CHECK-NEXT:    [[V71:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> [[V71]], i32 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+  %v00 = insertelement <4 x float> undef, float %f, i32 0
+  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
+  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
+  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
+  %tmp0 = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %v03, i32 4)
+  %v10 = insertelement <4 x float> undef, float %f, i32 0
+  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
+  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
+  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
+  %tmp1 = tail call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %v13, i32 4)
+  %v20 = insertelement <4 x float> undef, float %f, i32 0
+  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
+  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
+  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
+  %tmp2 = tail call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %v23, i32 4)
+  %v30 = insertelement <4 x float> undef, float %f, i32 0
+  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
+  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
+  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
+  %tmp3 = tail call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %v33, i32 4)
+  %v40 = insertelement <2 x double> undef, double %d, i32 0
+  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
+  %tmp4 = tail call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %v41, i32 4)
+  %v50 = insertelement <2 x double> undef, double %d, i32 0
+  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
+  %tmp5 = tail call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %v51, i32 4)
+  %v60 = insertelement <2 x double> undef, double %d, i32 0
+  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
+  %tmp6 = tail call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %v61, i32 4)
+  %v70 = insertelement <2 x double> undef, double %d, i32 0
+  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
+  %tmp7 = tail call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %v71, i32 4)
+  %tmp8 = add i32 %tmp0, %tmp2
+  %tmp9 = add i32 %tmp4, %tmp6
+  %tmp10 = add i32 %tmp8, %tmp9
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = add i64 %tmp1, %tmp3
+  %tmp13 = add i64 %tmp5, %tmp7
+  %tmp14 = add i64 %tmp12, %tmp13
+  %tmp15 = add i64 %tmp11, %tmp14
+  ret i64 %tmp15
+}
+
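+; Each conversion intrinsic above reads only element 0 of its vector operand,
+; so the zero-fills of lanes 1-3 (or lane 1) are dropped and a single
+; insertelement feeds each call, exactly as the CHECK lines record.
+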
+declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32)
+declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32)
+declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32)
+declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32)
+declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32)
+declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32)
+declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32)
+declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32)
+
+define i64 @test2(float %f, double %d) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> undef, float [[F:%.*]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[V03]], i32 4)
+; CHECK-NEXT:    [[V13:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> [[V13]], i32 4)
+; CHECK-NEXT:    [[V23:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[V23]], i32 4)
+; CHECK-NEXT:    [[V33:%.*]] = insertelement <4 x float> undef, float [[F]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> [[V33]], i32 4)
+; CHECK-NEXT:    [[V41:%.*]] = insertelement <2 x double> undef, double [[D:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[V41]], i32 4)
+; CHECK-NEXT:    [[V51:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> [[V51]], i32 4)
+; CHECK-NEXT:    [[V61:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[V61]], i32 4)
+; CHECK-NEXT:    [[V71:%.*]] = insertelement <2 x double> undef, double [[D]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> [[V71]], i32 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
+; CHECK-NEXT:    ret i64 [[TMP15]]
+;
+  %v00 = insertelement <4 x float> undef, float %f, i32 0
+  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
+  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
+  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
+  %tmp0 = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %v03, i32 4)
+  %v10 = insertelement <4 x float> undef, float %f, i32 0
+  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
+  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
+  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
+  %tmp1 = tail call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %v13, i32 4)
+  %v20 = insertelement <4 x float> undef, float %f, i32 0
+  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
+  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
+  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
+  %tmp2 = tail call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %v23, i32 4)
+  %v30 = insertelement <4 x float> undef, float %f, i32 0
+  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
+  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
+  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
+  %tmp3 = tail call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %v33, i32 4)
+  %v40 = insertelement <2 x double> undef, double %d, i32 0
+  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
+  %tmp4 = tail call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %v41, i32 4)
+  %v50 = insertelement <2 x double> undef, double %d, i32 0
+  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
+  %tmp5 = tail call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %v51, i32 4)
+  %v60 = insertelement <2 x double> undef, double %d, i32 0
+  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
+  %tmp6 = tail call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %v61, i32 4)
+  %v70 = insertelement <2 x double> undef, double %d, i32 0
+  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
+  %tmp7 = tail call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %v71, i32 4)
+  %tmp8 = add i32 %tmp0, %tmp2
+  %tmp9 = add i32 %tmp4, %tmp6
+  %tmp10 = add i32 %tmp8, %tmp9
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = add i64 %tmp1, %tmp3
+  %tmp13 = add i64 %tmp5, %tmp7
+  %tmp14 = add i64 %tmp12, %tmp13
+  %tmp15 = add i64 %tmp11, %tmp14
+  ret i64 %tmp15
+}
+
+declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32)
+declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32)
+declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32)
+declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32)
+declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32)
+declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32)
+declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32)
+declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32)
+
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
+declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)
+declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
+declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
+declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
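+; For rndscale, control immediate 1 rounds toward -inf and 2 toward +inf (the
+; low bits of the scale/round control), so these calls lower to llvm.floor.*
+; and llvm.ceil.* plus the usual mask select. Read this as a summary of the
+; tests below, not as a normative encoding reference.
+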
+define <4 x float> @test_rndscale_ss_floor(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ss_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SRC1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.floor.f32(float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP2]], float [[TMP5]], float [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[SRC0:%.*]], float [[TMP6]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP7]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k, i32 1, i32 4)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_rndscale_ss_ceil(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ss_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SRC1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.ceil.f32(float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP2]], float [[TMP5]], float [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[SRC0:%.*]], float [[TMP6]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP7]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k, i32 2, i32 4)
+  ret <4 x float> %1
+}
+
+define <2 x double> @test_rndscale_sd_floor(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_sd_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.floor.f64(double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP2]], double [[TMP5]], double [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[SRC0:%.*]], double [[TMP6]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP7]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k, i32 1, i32 4)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_rndscale_sd_ceil(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_sd_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.ceil.f64(double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP2]], double [[TMP5]], double [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[SRC0:%.*]], double [[TMP6]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP7]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k, i32 2, i32 4)
+  ret <2 x double> %1
+}
+
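+; Packed rndscale with imm=1/imm=2 should fold to the generic llvm.floor/llvm.ceil
+; vector intrinsics. The i8/i16 mask is bitcast to an <8 x i1>/<16 x i1> vector
+; and, for the narrower types, shuffled down to the live lanes before the final
+; select against the passthru operand.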
+define <4 x float> @test_rndscale_ps_128_floor(<4 x float> %src, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_128_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP1]], <4 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %src, i32 1, <4 x float> %dst, i8 %k)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_rndscale_ps_128_ceil(<4 x float> %src, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_128_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP1]], <4 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %src, i32 2, <4 x float> %dst, i8 %k)
+  ret <4 x float> %1
+}
+
+define <8 x float> @test_rndscale_ps_256_floor(<8 x float> %src, <8 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_256_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %src, i32 1, <8 x float> %dst, i8 %k)
+  ret <8 x float> %1
+}
+
+define <8 x float> @test_rndscale_ps_256_ceil(<8 x float> %src, <8 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_256_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %src, i32 2, <8 x float> %dst, i8 %k)
+  ret <8 x float> %1
+}
+
+define <16 x float> @test_rndscale_ps_512_floor(<16 x float> %src, <16 x float> %dst, i16 %k) {
+; CHECK-LABEL: @test_rndscale_ps_512_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[K:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %src, i32 1, <16 x float> %dst, i16 %k, i32 4)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_rndscale_ps_512_ceil(<16 x float> %src, <16 x float> %dst, i16 %k) {
+; CHECK-LABEL: @test_rndscale_ps_512_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[K:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %src, i32 2, <16 x float> %dst, i16 %k, i32 4)
+  ret <16 x float> %1
+}
+
+define <2 x double> @test_rndscale_pd_128_floor(<2 x double> %src, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_128_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP1]], <2 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %src, i32 1, <2 x double> %dst, i8 %k)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_rndscale_pd_128_ceil(<2 x double> %src, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_128_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP1]], <2 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %src, i32 2, <2 x double> %dst, i8 %k)
+  ret <2 x double> %1
+}
+
+define <4 x double> @test_rndscale_pd_256_floor(<4 x double> %src, <4 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_256_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %src, i32 1, <4 x double> %dst, i8 %k)
+  ret <4 x double> %1
+}
+
+define <4 x double> @test_rndscale_pd_256_ceil(<4 x double> %src, <4 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_256_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %src, i32 2, <4 x double> %dst, i8 %k)
+  ret <4 x double> %1
+}
+
+define <8 x double> @test_rndscale_pd_512_floor(<8 x double> %src, <8 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_512_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %src, i32 1, <8 x double> %dst, i8 %k, i32 4)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_rndscale_pd_512_ceil(<8 x double> %src, <8 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_512_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %src, i32 2, <8 x double> %dst, i8 %k, i32 4)
+  ret <8 x double> %1
+}
+
+declare float @llvm.fma.f32(float, float, float) #1
+
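+; Masked scalar FMA, expressed in generic IR. Only lane 0 of each operand feeds
+; the fma, so the insertelement chains that fill the upper lanes with constants
+; are expected to fold away.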
+define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask_vfmadd_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = extractelement <4 x float> %a, i64 0
+  %8 = extractelement <4 x float> %3, i64 0
+  %9 = extractelement <4 x float> %6, i64 0
+  %10 = call float @llvm.fma.f32(float %7, float %8, float %9)
+  %11 = bitcast i8 %mask to <8 x i1>
+  %12 = extractelement <8 x i1> %11, i64 0
+  %13 = select i1 %12, float %10, float %7
+  %14 = insertelement <4 x float> %a, float %13, i64 0
+  ret <4 x float> %14
+}
+
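+; Here and in the mask variants below: the _0 tests extract lane 0 of the merged
+; result, so the scalar select must survive; the _1 tests extract lane 1, which
+; holds the constant 1.0 inserted above, so they should constant-fold completely.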
+define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask_vfmadd_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float [[TMP1]]
+; CHECK-NEXT:    ret float [[TMP7]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = extractelement <4 x float> %3, i64 0
+  %5 = extractelement <4 x float> %b, i64 0
+  %6 = extractelement <4 x float> %c, i64 0
+  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
+  %8 = bitcast i8 %mask to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, float %7, float %4
+  %11 = insertelement <4 x float> %3, float %10, i64 0
+  %12 = extractelement <4 x float> %11, i32 0
+  ret float %12
+}
+
+define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask_vfmadd_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = extractelement <4 x float> %3, i64 0
+  %5 = extractelement <4 x float> %b, i64 0
+  %6 = extractelement <4 x float> %c, i64 0
+  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
+  %8 = bitcast i8 %mask to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, float %7, float %4
+  %11 = insertelement <4 x float> %3, float %10, i64 0
+  %12 = extractelement <4 x float> %11, i32 1
+  ret float %12
+}
+
+declare double @llvm.fma.f64(double, double, double) #1
+
+define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask_vfmadd_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
+  %3 = extractelement <2 x double> %a, i64 0
+  %4 = extractelement <2 x double> %1, i64 0
+  %5 = extractelement <2 x double> %2, i64 0
+  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
+  %7 = bitcast i8 %mask to <8 x i1>
+  %8 = extractelement <8 x i1> %7, i64 0
+  %9 = select i1 %8, double %6, double %3
+  %10 = insertelement <2 x double> %a, double %9, i64 0
+  ret <2 x double> %10
+}
+
+define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask_vfmadd_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double [[TMP1]]
+; CHECK-NEXT:    ret double [[TMP7]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = extractelement <2 x double> %1, i64 0
+  %3 = extractelement <2 x double> %b, i64 0
+  %4 = extractelement <2 x double> %c, i64 0
+  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+  %6 = bitcast i8 %mask to <8 x i1>
+  %7 = extractelement <8 x i1> %6, i64 0
+  %8 = select i1 %7, double %5, double %2
+  %9 = insertelement <2 x double> %1, double %8, i64 0
+  %10 = extractelement <2 x double> %9, i32 0
+  ret double %10
+}
+
+define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask_vfmadd_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = extractelement <2 x double> %1, i64 0
+  %3 = extractelement <2 x double> %b, i64 0
+  %4 = extractelement <2 x double> %c, i64 0
+  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+  %6 = bitcast i8 %mask to <8 x i1>
+  %7 = extractelement <8 x i1> %6, i64 0
+  %8 = select i1 %7, double %5, double %2
+  %9 = insertelement <2 x double> %1, double %8, i64 0
+  %10 = extractelement <2 x double> %9, i32 1
+  ret double %10
+}
+
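+; Zero-masked (maskz) variants: when mask bit 0 is clear, the select produces
+; 0.0 instead of a passthru element.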
+define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_maskz_vfmadd_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[A]], float [[TMP7]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = extractelement <4 x float> %a, i64 0
+  %8 = extractelement <4 x float> %3, i64 0
+  %9 = extractelement <4 x float> %6, i64 0
+  %10 = call float @llvm.fma.f32(float %7, float %8, float %9)
+  %11 = bitcast i8 %mask to <8 x i1>
+  %12 = extractelement <8 x i1> %11, i64 0
+  %13 = select i1 %12, float %10, float 0.000000e+00
+  %14 = insertelement <4 x float> %a, float %13, i64 0
+  ret <4 x float> %14
+}
+
+define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_maskz_vfmadd_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00
+; CHECK-NEXT:    ret float [[TMP7]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = extractelement <4 x float> %3, i64 0
+  %5 = extractelement <4 x float> %b, i64 0
+  %6 = extractelement <4 x float> %c, i64 0
+  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
+  %8 = bitcast i8 %mask to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, float %7, float 0.000000e+00
+  %11 = insertelement <4 x float> %3, float %10, i64 0
+  %12 = extractelement <4 x float> %11, i32 0
+  ret float %12
+}
+
+define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_maskz_vfmadd_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = extractelement <4 x float> %3, i64 0
+  %5 = extractelement <4 x float> %b, i64 0
+  %6 = extractelement <4 x float> %c, i64 0
+  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
+  %8 = bitcast i8 %mask to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, float %7, float 0.000000e+00
+  %11 = insertelement <4 x float> %3, float %10, i64 0
+  %12 = extractelement <4 x float> %11, i32 1
+  ret float %12
+}
+
+define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_maskz_vfmadd_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[A]], double [[TMP7]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
+  %3 = extractelement <2 x double> %a, i64 0
+  %4 = extractelement <2 x double> %1, i64 0
+  %5 = extractelement <2 x double> %2, i64 0
+  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
+  %7 = bitcast i8 %mask to <8 x i1>
+  %8 = extractelement <8 x i1> %7, i64 0
+  %9 = select i1 %8, double %6, double 0.000000e+00
+  %10 = insertelement <2 x double> %a, double %9, i64 0
+  ret <2 x double> %10
+}
+
+define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_maskz_vfmadd_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00
+; CHECK-NEXT:    ret double [[TMP7]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = extractelement <2 x double> %1, i64 0
+  %3 = extractelement <2 x double> %b, i64 0
+  %4 = extractelement <2 x double> %c, i64 0
+  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+  %6 = bitcast i8 %mask to <8 x i1>
+  %7 = extractelement <8 x i1> %6, i64 0
+  %8 = select i1 %7, double %5, double 0.000000e+00
+  %9 = insertelement <2 x double> %1, double %8, i64 0
+  %10 = extractelement <2 x double> %9, i32 0
+  ret double %10
+}
+
+define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_maskz_vfmadd_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = extractelement <2 x double> %1, i64 0
+  %3 = extractelement <2 x double> %b, i64 0
+  %4 = extractelement <2 x double> %c, i64 0
+  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+  %6 = bitcast i8 %mask to <8 x i1>
+  %7 = extractelement <8 x i1> %6, i64 0
+  %8 = select i1 %7, double %5, double 0.000000e+00
+  %9 = insertelement <2 x double> %1, double %8, i64 0
+  %10 = extractelement <2 x double> %9, i32 1
+  ret double %10
+}
+
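+; mask3 variants merge into the addend %c: the select picks between the fma
+; result and c's element 0, and the result is inserted back into %c.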
+define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmadd_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[C]], float [[TMP7]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = extractelement <4 x float> %3, i64 0
+  %8 = extractelement <4 x float> %6, i64 0
+  %9 = extractelement <4 x float> %c, i64 0
+  %10 = call float @llvm.fma.f32(float %7, float %8, float %9)
+  %11 = bitcast i8 %mask to <8 x i1>
+  %12 = extractelement <8 x i1> %11, i64 0
+  %13 = select i1 %12, float %10, float %9
+  %14 = insertelement <4 x float> %c, float %13, i64 0
+  ret <4 x float> %14
+}
+
+define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmadd_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], float [[TMP4]], float [[TMP3]]
+; CHECK-NEXT:    ret float [[TMP7]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = extractelement <4 x float> %a, i64 0
+  %5 = extractelement <4 x float> %b, i64 0
+  %6 = extractelement <4 x float> %3, i64 0
+  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
+  %8 = bitcast i8 %mask to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, float %7, float %6
+  %11 = insertelement <4 x float> %3, float %10, i64 0
+  %12 = extractelement <4 x float> %11, i32 0
+  ret float %12
+}
+
+define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmadd_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = extractelement <4 x float> %a, i64 0
+  %5 = extractelement <4 x float> %b, i64 0
+  %6 = extractelement <4 x float> %3, i64 0
+  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
+  %8 = bitcast i8 %mask to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, float %7, float %6
+  %11 = insertelement <4 x float> %3, float %10, i64 0
+  %12 = extractelement <4 x float> %11, i32 1
+  ret float %12
+}
+
+define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmadd_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[C]], double [[TMP7]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %3 = extractelement <2 x double> %1, i64 0
+  %4 = extractelement <2 x double> %2, i64 0
+  %5 = extractelement <2 x double> %c, i64 0
+  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
+  %7 = bitcast i8 %mask to <8 x i1>
+  %8 = extractelement <8 x i1> %7, i64 0
+  %9 = select i1 %8, double %6, double %5
+  %10 = insertelement <2 x double> %c, double %9, i64 0
+  ret <2 x double> %10
+}
+
+define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmadd_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], double [[TMP4]], double [[TMP3]]
+; CHECK-NEXT:    ret double [[TMP7]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = extractelement <2 x double> %a, i64 0
+  %3 = extractelement <2 x double> %b, i64 0
+  %4 = extractelement <2 x double> %1, i64 0
+  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+  %6 = bitcast i8 %mask to <8 x i1>
+  %7 = extractelement <8 x i1> %6, i64 0
+  %8 = select i1 %7, double %5, double %4
+  %9 = insertelement <2 x double> %1, double %8, i64 0
+  %10 = extractelement <2 x double> %9, i32 0
+  ret double %10
+}
+
+define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmadd_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = extractelement <2 x double> %a, i64 0
+  %3 = extractelement <2 x double> %b, i64 0
+  %4 = extractelement <2 x double> %1, i64 0
+  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+  %6 = bitcast i8 %mask to <8 x i1>
+  %7 = extractelement <8 x i1> %6, i64 0
+  %8 = select i1 %7, double %5, double %4
+  %9 = insertelement <2 x double> %1, double %8, i64 0
+  %10 = extractelement <2 x double> %9, i32 1
+  ret double %10
+}
+
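+; vfmsub is fma with a negated addend. The vector negation (fsub -0.0, %c)
+; should be scalarized to a single fsub of element 0.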
+define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmsub_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub float -0.000000e+00, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[C]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP5]], float [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[C]], float [[TMP9]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %8 = extractelement <4 x float> %3, i64 0
+  %9 = extractelement <4 x float> %6, i64 0
+  %10 = extractelement <4 x float> %7, i64 0
+  %11 = call float @llvm.fma.f32(float %8, float %9, float %10)
+  %12 = extractelement <4 x float> %c, i64 0
+  %13 = bitcast i8 %mask to <8 x i1>
+  %14 = extractelement <8 x i1> %13, i64 0
+  %15 = select i1 %14, float %11, float %12
+  %16 = insertelement <4 x float> %c, float %15, i64 0
+  ret <4 x float> %16
+}
+
+define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmsub_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub float -0.000000e+00, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[C]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP5]], float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP9]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
+  %5 = extractelement <4 x float> %a, i64 0
+  %6 = extractelement <4 x float> %b, i64 0
+  %7 = extractelement <4 x float> %4, i64 0
+  %8 = call float @llvm.fma.f32(float %5, float %6, float %7)
+  %9 = extractelement <4 x float> %3, i64 0
+  %10 = bitcast i8 %mask to <8 x i1>
+  %11 = extractelement <8 x i1> %10, i64 0
+  %12 = select i1 %11, float %8, float %9
+  %13 = insertelement <4 x float> %3, float %12, i64 0
+  %14 = extractelement <4 x float> %13, i32 0
+  ret float %14
+}
+
+define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmsub_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
+  %5 = extractelement <4 x float> %a, i64 0
+  %6 = extractelement <4 x float> %b, i64 0
+  %7 = extractelement <4 x float> %4, i64 0
+  %8 = call float @llvm.fma.f32(float %5, float %6, float %7)
+  %9 = extractelement <4 x float> %3, i64 0
+  %10 = bitcast i8 %mask to <8 x i1>
+  %11 = extractelement <8 x i1> %10, i64 0
+  %12 = select i1 %11, float %8, float %9
+  %13 = insertelement <4 x float> %3, float %12, i64 0
+  %14 = extractelement <4 x float> %13, i32 1
+  ret float %14
+}
+
+define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmsub_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub double -0.000000e+00, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[C]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP5]], double [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[C]], double [[TMP9]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP10]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
+  %4 = extractelement <2 x double> %1, i64 0
+  %5 = extractelement <2 x double> %2, i64 0
+  %6 = extractelement <2 x double> %3, i64 0
+  %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
+  %8 = extractelement <2 x double> %c, i64 0
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = extractelement <8 x i1> %9, i64 0
+  %11 = select i1 %10, double %7, double %8
+  %12 = insertelement <2 x double> %c, double %11, i64 0
+  ret <2 x double> %12
+}
+
+define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmsub_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub double -0.000000e+00, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[C]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP5]], double [[TMP6]]
+; CHECK-NEXT:    ret double [[TMP9]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
+  %3 = extractelement <2 x double> %a, i64 0
+  %4 = extractelement <2 x double> %b, i64 0
+  %5 = extractelement <2 x double> %2, i64 0
+  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
+  %7 = extractelement <2 x double> %1, i64 0
+  %8 = bitcast i8 %mask to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, double %6, double %7
+  %11 = insertelement <2 x double> %1, double %10, i64 0
+  %12 = extractelement <2 x double> %11, i32 0
+  ret double %12
+}
+
+define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfmsub_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
+  %3 = extractelement <2 x double> %a, i64 0
+  %4 = extractelement <2 x double> %b, i64 0
+  %5 = extractelement <2 x double> %2, i64 0
+  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
+  %7 = extractelement <2 x double> %1, i64 0
+  %8 = bitcast i8 %mask to <8 x i1>
+  %9 = extractelement <8 x i1> %8, i64 0
+  %10 = select i1 %9, double %6, double %7
+  %11 = insertelement <2 x double> %1, double %10, i64 0
+  %12 = extractelement <2 x double> %11, i32 1
+  ret double %12
+}
+
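+; vfnmsub negates both the first multiplicand and the addend; both vector
+; negations should scalarize to lane-0 fsubs.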
+define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfnmsub_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub float -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub float -0.000000e+00, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP3]], float [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[C]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP6]], float [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[C]], float [[TMP10]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
+  %8 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %9 = extractelement <4 x float> %7, i64 0
+  %10 = extractelement <4 x float> %6, i64 0
+  %11 = extractelement <4 x float> %8, i64 0
+  %12 = call float @llvm.fma.f32(float %9, float %10, float %11)
+  %13 = extractelement <4 x float> %c, i64 0
+  %14 = bitcast i8 %mask to <8 x i1>
+  %15 = extractelement <8 x i1> %14, i64 0
+  %16 = select i1 %15, float %12, float %13
+  %17 = insertelement <4 x float> %c, float %16, i64 0
+  ret <4 x float> %17
+}
+
+define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfnmsub_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub float -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub float -0.000000e+00, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP3]], float [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[C]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP6]], float [[TMP7]]
+; CHECK-NEXT:    ret float [[TMP10]]
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+  %5 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
+  %6 = extractelement <4 x float> %4, i64 0
+  %7 = extractelement <4 x float> %b, i64 0
+  %8 = extractelement <4 x float> %5, i64 0
+  %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
+  %10 = extractelement <4 x float> %3, i64 0
+  %11 = bitcast i8 %mask to <8 x i1>
+  %12 = extractelement <8 x i1> %11, i64 0
+  %13 = select i1 %12, float %9, float %10
+  %14 = insertelement <4 x float> %3, float %13, i64 0
+  %15 = extractelement <4 x float> %14, i32 0
+  ret float %15
+}
+
+define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfnmsub_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+  %5 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
+  %6 = extractelement <4 x float> %4, i64 0
+  %7 = extractelement <4 x float> %b, i64 0
+  %8 = extractelement <4 x float> %5, i64 0
+  %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
+  %10 = extractelement <4 x float> %3, i64 0
+  %11 = bitcast i8 %mask to <8 x i1>
+  %12 = extractelement <8 x i1> %11, i64 0
+  %13 = select i1 %12, float %9, float %10
+  %14 = insertelement <4 x float> %3, float %13, i64 0
+  %15 = extractelement <4 x float> %14, i32 1
+  ret float %15
+}
+
+define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfnmsub_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub double -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub double -0.000000e+00, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[TMP3]], double [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[C]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], double [[TMP6]], double [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[C]], double [[TMP10]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
+  %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
+  %4 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
+  %5 = extractelement <2 x double> %3, i64 0
+  %6 = extractelement <2 x double> %2, i64 0
+  %7 = extractelement <2 x double> %4, i64 0
+  %8 = call double @llvm.fma.f64(double %5, double %6, double %7)
+  %9 = extractelement <2 x double> %c, i64 0
+  %10 = bitcast i8 %mask to <8 x i1>
+  %11 = extractelement <8 x i1> %10, i64 0
+  %12 = select i1 %11, double %8, double %9
+  %13 = insertelement <2 x double> %c, double %12, i64 0
+  ret <2 x double> %13
+}
+
+define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfnmsub_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub double -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub double -0.000000e+00, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[TMP3]], double [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[C]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], double [[TMP6]], double [[TMP7]]
+; CHECK-NEXT:    ret double [[TMP10]]
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+  %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
+  %4 = extractelement <2 x double> %2, i64 0
+  %5 = extractelement <2 x double> %b, i64 0
+  %6 = extractelement <2 x double> %3, i64 0
+  %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
+  %8 = extractelement <2 x double> %1, i64 0
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = extractelement <8 x i1> %9, i64 0
+  %11 = select i1 %10, double %7, double %8
+  %12 = insertelement <2 x double> %1, double %11, i64 0
+  %13 = extractelement <2 x double> %12, i32 0
+  ret double %13
+}
+
+define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mask3_vfnmsub_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+  %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
+  %4 = extractelement <2 x double> %2, i64 0
+  %5 = extractelement <2 x double> %b, i64 0
+  %6 = extractelement <2 x double> %3, i64 0
+  %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
+  %8 = extractelement <2 x double> %1, i64 0
+  %9 = bitcast i8 %mask to <8 x i1>
+  %10 = extractelement <8 x i1> %9, i64 0
+  %11 = select i1 %10, double %7, double %8
+  %12 = insertelement <2 x double> %1, double %11, i64 0
+  %13 = extractelement <2 x double> %12, i32 1
+  ret double %13
+}
+
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
+
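+; avx2 permd with a constant index vector should fold to a shufflevector
+; (and to the input itself for the identity permutation); undef indices
+; propagate, and the masked variants keep only the final select.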
+define <8 x i32> @identity_test_permvar_si_256(<8 x i32> %a0) {
+; CHECK-LABEL: @identity_test_permvar_si_256(
+; CHECK-NEXT:    ret <8 x i32> [[A0:%.*]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @identity_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_si_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[A0:%.*]], <8 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @zero_test_permvar_si_256(<8 x i32> %a0) {
+; CHECK-LABEL: @zero_test_permvar_si_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @zero_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_si_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> zeroinitializer)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @shuffle_test_permvar_si_256(<8 x i32> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_si_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @shuffle_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_si_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru
+  ret <8 x i32> %3
+}
+
+define <8 x i32> @undef_test_permvar_si_256(<8 x i32> %a0) {
+; CHECK-LABEL: @undef_test_permvar_si_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @undef_test_permvar_si_256_mask(<8 x i32> %a0, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_si_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %passthru
+  ret <8 x i32> %3
+}
+
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
+
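+; Same folds for the float variant, avx2 permps.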
+define <8 x float> @identity_test_permvar_sf_256(<8 x float> %a0) {
+; CHECK-LABEL: @identity_test_permvar_sf_256(
+; CHECK-NEXT:    ret <8 x float> [[A0:%.*]]
+;
+  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  ret <8 x float> %1
+}
+
+define <8 x float> @identity_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_sf_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[A0:%.*]], <8 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru
+  ret <8 x float> %3
+}
+
+define <8 x float> @zero_test_permvar_sf_256(<8 x float> %a0) {
+; CHECK-LABEL: @zero_test_permvar_sf_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer)
+  ret <8 x float> %1
+}
+
+define <8 x float> @zero_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_sf_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> zeroinitializer)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru
+  ret <8 x float> %3
+}
+
+define <8 x float> @shuffle_test_permvar_sf_256(<8 x float> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_sf_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %1
+}
+
+define <8 x float> @shuffle_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_sf_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru
+  ret <8 x float> %3
+}
+
+define <8 x float> @undef_test_permvar_sf_256(<8 x float> %a0) {
+; CHECK-LABEL: @undef_test_permvar_sf_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <8 x float> %1
+}
+
+define <8 x float> @undef_test_permvar_sf_256_mask(<8 x float> %a0, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_sf_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %passthru
+  ret <8 x float> %3
+}
+
+declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
+
+define <4 x i64> @identity_test_permvar_di_256(<4 x i64> %a0) {
+; CHECK-LABEL: @identity_test_permvar_di_256(
+; CHECK-NEXT:    ret <4 x i64> [[A0:%.*]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @identity_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_di_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @zero_test_permvar_di_256(<4 x i64> %a0) {
+; CHECK-LABEL: @zero_test_permvar_di_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @zero_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_di_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> zeroinitializer)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @shuffle_test_permvar_di_256(<4 x i64> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_di_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 3, i64 2, i64 1, i64 0>)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @shuffle_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_di_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 3, i64 2, i64 1, i64 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @undef_test_permvar_di_256(<4 x i64> %a0) {
+; CHECK-LABEL: @undef_test_permvar_di_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 undef, i64 2, i64 1, i64 0>)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @undef_test_permvar_di_256_mask(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_di_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP3]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> <i64 undef, i64 2, i64 1, i64 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %passthru
+  ret <4 x i64> %3
+}
+
+declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
+
+define <4 x double> @identity_test_permvar_df_256(<4 x double> %a0) {
+; CHECK-LABEL: @identity_test_permvar_df_256(
+; CHECK-NEXT:    ret <4 x double> [[A0:%.*]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>)
+  ret <4 x double> %1
+}
+
+define <4 x double> @identity_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_df_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[A0:%.*]], <4 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 0, i64 1, i64 2, i64 3>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru
+  ret <4 x double> %3
+}
+
+define <4 x double> @zero_test_permvar_df_256(<4 x double> %a0) {
+; CHECK-LABEL: @zero_test_permvar_df_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer)
+  ret <4 x double> %1
+}
+
+define <4 x double> @zero_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_df_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> zeroinitializer)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru
+  ret <4 x double> %3
+}
+
+define <4 x double> @shuffle_test_permvar_df_256(<4 x double> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_df_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 3, i64 2, i64 1, i64 0>)
+  ret <4 x double> %1
+}
+
+define <4 x double> @shuffle_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_df_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 3, i64 2, i64 1, i64 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru
+  ret <4 x double> %3
+}
+
+define <4 x double> @undef_test_permvar_df_256(<4 x double> %a0) {
+; CHECK-LABEL: @undef_test_permvar_df_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 undef, i64 2, i64 1, i64 0>)
+  ret <4 x double> %1
+}
+
+define <4 x double> @undef_test_permvar_df_256_mask(<4 x double> %a0, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_df_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> <i64 undef, i64 2, i64 1, i64 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %passthru
+  ret <4 x double> %3
+}
+
+declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
+
+define <16 x i32> @identity_test_permvar_si_512(<16 x i32> %a0) {
+; CHECK-LABEL: @identity_test_permvar_si_512(
+; CHECK-NEXT:    ret <16 x i32> [[A0:%.*]]
+;
+  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>)
+  ret <16 x i32> %1
+}
+
+define <16 x i32> @identity_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: @identity_test_permvar_si_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[A0:%.*]], <16 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i32> [[TMP2]]
+;
+  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru
+  ret <16 x i32> %3
+}
+
+define <16 x i32> @zero_test_permvar_si_512(<16 x i32> %a0) {
+; CHECK-LABEL: @zero_test_permvar_si_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
+;
+  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer)
+  ret <16 x i32> %1
+}
+
+define <16 x i32> @zero_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: @zero_test_permvar_si_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
+;
+  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> zeroinitializer)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru
+  ret <16 x i32> %3
+}
+
+define <16 x i32> @shuffle_test_permvar_si_512(<16 x i32> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_si_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
+;
+  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <16 x i32> %1
+}
+
+define <16 x i32> @shuffle_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_si_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
+;
+  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru
+  ret <16 x i32> %3
+}
+
+define <16 x i32> @undef_test_permvar_si_512(<16 x i32> %a0) {
+; CHECK-LABEL: @undef_test_permvar_si_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i32> [[TMP1]]
+;
+  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <16 x i32> %1
+}
+
+define <16 x i32> @undef_test_permvar_si_512_mask(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: @undef_test_permvar_si_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i32> [[TMP1]], <16 x i32> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
+;
+  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a0, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %passthru
+  ret <16 x i32> %3
+}
+
+declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
+
+define <16 x float> @identity_test_permvar_sf_512(<16 x float> %a0) {
+; CHECK-LABEL: @identity_test_permvar_sf_512(
+; CHECK-NEXT:    ret <16 x float> [[A0:%.*]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>)
+  ret <16 x float> %1
+}
+
+define <16 x float> @identity_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: @identity_test_permvar_sf_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x float> [[A0:%.*]], <16 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP2]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+  ret <16 x float> %3
+}
+
+define <16 x float> @zero_test_permvar_sf_512(<16 x float> %a0) {
+; CHECK-LABEL: @zero_test_permvar_sf_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer)
+  ret <16 x float> %1
+}
+
+define <16 x float> @zero_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: @zero_test_permvar_sf_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> zeroinitializer)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+  ret <16 x float> %3
+}
+
+define <16 x float> @shuffle_test_permvar_sf_512(<16 x float> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_sf_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <16 x float> %1
+}
+
+define <16 x float> @shuffle_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_sf_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+  ret <16 x float> %3
+}
+
+define <16 x float> @undef_test_permvar_sf_512(<16 x float> %a0) {
+; CHECK-LABEL: @undef_test_permvar_sf_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  ret <16 x float> %1
+}
+
+define <16 x float> @undef_test_permvar_sf_512_mask(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: @undef_test_permvar_sf_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x float> [[A0:%.*]], <16 x float> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+  ret <16 x float> %3
+}
+
+declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
+
+define <8 x i64> @identity_test_permvar_di_512(<8 x i64> %a0) {
+; CHECK-LABEL: @identity_test_permvar_di_512(
+; CHECK-NEXT:    ret <8 x i64> [[A0:%.*]]
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>)
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @identity_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_di_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[A0:%.*]], <8 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i64> [[TMP2]]
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @zero_test_permvar_di_512(<8 x i64> %a0) {
+; CHECK-LABEL: @zero_test_permvar_di_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer)
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @zero_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_di_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> zeroinitializer)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @shuffle_test_permvar_di_512(<8 x i64> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_di_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @shuffle_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_di_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @undef_test_permvar_di_512(<8 x i64> %a0) {
+; CHECK-LABEL: @undef_test_permvar_di_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @undef_test_permvar_di_512_mask(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_di_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[A0:%.*]], <8 x i64> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> [[TMP1]], <8 x i64> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %passthru
+  ret <8 x i64> %3
+}
+
+declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
+
+define <8 x double> @identity_test_permvar_df_512(<8 x double> %a0) {
+; CHECK-LABEL: @identity_test_permvar_df_512(
+; CHECK-NEXT:    ret <8 x double> [[A0:%.*]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>)
+  ret <8 x double> %1
+}
+
+define <8 x double> @identity_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_df_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x double> [[A0:%.*]], <8 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP2]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+  ret <8 x double> %3
+}
+
+define <8 x double> @zero_test_permvar_df_512(<8 x double> %a0) {
+; CHECK-LABEL: @zero_test_permvar_df_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer)
+  ret <8 x double> %1
+}
+
+define <8 x double> @zero_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_df_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> zeroinitializer)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+  ret <8 x double> %3
+}
+
+define <8 x double> @shuffle_test_permvar_df_512(<8 x double> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_df_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
+  ret <8 x double> %1
+}
+
+define <8 x double> @shuffle_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_df_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+  ret <8 x double> %3
+}
+
+define <8 x double> @undef_test_permvar_df_512(<8 x double> %a0) {
+; CHECK-LABEL: @undef_test_permvar_df_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
+  ret <8 x double> %1
+}
+
+define <8 x double> @undef_test_permvar_df_512_mask(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_df_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A0:%.*]], <8 x double> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+  ret <8 x double> %3
+}
+
+declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @identity_test_permvar_hi_128(<8 x i16> %a0) {
+; CHECK-LABEL: @identity_test_permvar_hi_128(
+; CHECK-NEXT:    ret <8 x i16> [[A0:%.*]]
+;
+  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @identity_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: @identity_test_permvar_hi_128_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A0:%.*]], <8 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @zero_test_permvar_hi_128(<8 x i16> %a0) {
+; CHECK-LABEL: @zero_test_permvar_hi_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+;
+  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @zero_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: @zero_test_permvar_hi_128_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> zeroinitializer)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @shuffle_test_permvar_hi_128(<8 x i16> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+;
+  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @shuffle_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_128_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @undef_test_permvar_hi_128(<8 x i16> %a0) {
+; CHECK-LABEL: @undef_test_permvar_hi_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+;
+  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 undef, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @undef_test_permvar_hi_128_mask(<8 x i16> %a0, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: @undef_test_permvar_hi_128_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> undef, <8 x i32> <i32 undef, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP1]], <8 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %a0, <8 x i16> <i16 undef, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passthru
+  ret <8 x i16> %3
+}
+
+declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>)
+
+define <16 x i16> @identity_test_permvar_hi_256(<16 x i16> %a0) {
+; CHECK-LABEL: @identity_test_permvar_hi_256(
+; CHECK-NEXT:    ret <16 x i16> [[A0:%.*]]
+;
+  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @identity_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: @identity_test_permvar_hi_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i16> [[A0:%.*]], <16 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i16> [[TMP2]]
+;
+  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @zero_test_permvar_hi_256(<16 x i16> %a0) {
+; CHECK-LABEL: @zero_test_permvar_hi_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
+;
+  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @zero_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: @zero_test_permvar_hi_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> zeroinitializer)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @shuffle_test_permvar_hi_256(<16 x i16> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
+;
+  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @shuffle_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru
+  ret <16 x i16> %3
+}
+
+define <16 x i16> @undef_test_permvar_hi_256(<16 x i16> %a0) {
+; CHECK-LABEL: @undef_test_permvar_hi_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
+;
+  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 undef, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @undef_test_permvar_hi_256_mask(<16 x i16> %a0, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: @undef_test_permvar_hi_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i16> [[TMP1]], <16 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a0, <16 x i16> <i16 undef, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passthru
+  ret <16 x i16> %3
+}
+
+declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>)
+
+define <32 x i16> @identity_test_permvar_hi_512(<32 x i16> %a0) {
+; CHECK-LABEL: @identity_test_permvar_hi_512(
+; CHECK-NEXT:    ret <32 x i16> [[A0:%.*]]
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @identity_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: @identity_test_permvar_hi_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i16> [[A0:%.*]], <32 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <32 x i16> [[TMP2]]
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @zero_test_permvar_hi_512(<32 x i16> %a0) {
+; CHECK-LABEL: @zero_test_permvar_hi_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @zero_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: @zero_test_permvar_hi_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> zeroinitializer)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @shuffle_test_permvar_hi_512(<32 x i16> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @shuffle_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_hi_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @undef_test_permvar_hi_512(<32 x i16> %a0) {
+; CHECK-LABEL: @undef_test_permvar_hi_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 undef, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @undef_test_permvar_hi_512_mask(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: @undef_test_permvar_hi_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[A0:%.*]], <32 x i16> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i16> [[TMP1]], <32 x i16> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 undef, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru
+  ret <32 x i16> %3
+}
+
+declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @identity_test_permvar_qi_128(<16 x i8> %a0) {
+; CHECK-LABEL: @identity_test_permvar_qi_128(
+; CHECK-NEXT:    ret <16 x i8> [[A0:%.*]]
+;
+  %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @identity_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: @identity_test_permvar_qi_128_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A0:%.*]], <16 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+;
+  %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @zero_test_permvar_qi_128(<16 x i8> %a0) {
+; CHECK-LABEL: @zero_test_permvar_qi_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @zero_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: @zero_test_permvar_qi_128_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+;
+  %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> zeroinitializer)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @shuffle_test_permvar_qi_128(<16 x i8> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @shuffle_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_128_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+;
+  %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @undef_test_permvar_qi_128(<16 x i8> %a0) {
+; CHECK-LABEL: @undef_test_permvar_qi_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @undef_test_permvar_qi_128_mask(<16 x i8> %a0, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: @undef_test_permvar_qi_128_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A0:%.*]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP1]], <16 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+;
+  %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passthru
+  ret <16 x i8> %3
+}
+
+declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>)
+
+define <32 x i8> @identity_test_permvar_qi_256(<32 x i8> %a0) {
+; CHECK-LABEL: @identity_test_permvar_qi_256(
+; CHECK-NEXT:    ret <32 x i8> [[A0:%.*]]
+;
+  %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @identity_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: @identity_test_permvar_qi_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[A0:%.*]], <32 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
+;
+  %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @zero_test_permvar_qi_256(<32 x i8> %a0) {
+; CHECK-LABEL: @zero_test_permvar_qi_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @zero_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: @zero_test_permvar_qi_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <32 x i8> [[TMP3]]
+;
+  %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> zeroinitializer)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @shuffle_test_permvar_qi_256(<32 x i8> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @shuffle_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <32 x i8> [[TMP3]]
+;
+  %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @undef_test_permvar_qi_256(<32 x i8> %a0) {
+; CHECK-LABEL: @undef_test_permvar_qi_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 undef, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @undef_test_permvar_qi_256_mask(<32 x i8> %a0, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: @undef_test_permvar_qi_256_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[A0:%.*]], <32 x i8> undef, <32 x i32> <i32 undef, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[TMP2]], <32 x i8> [[TMP1]], <32 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <32 x i8> [[TMP3]]
+;
+  %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a0, <32 x i8> <i8 undef, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passthru
+  ret <32 x i8> %3
+}
+
+declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>)
+
+define <64 x i8> @identity_test_permvar_qi_512(<64 x i8> %a0) {
+; CHECK-LABEL: @identity_test_permvar_qi_512(
+; CHECK-NEXT:    ret <64 x i8> [[A0:%.*]]
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @identity_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: @identity_test_permvar_qi_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <64 x i1> [[TMP1]], <64 x i8> [[A0:%.*]], <64 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <64 x i8> [[TMP2]]
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63>)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @zero_test_permvar_qi_512(<64 x i8> %a0) {
+; CHECK-LABEL: @zero_test_permvar_qi_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @zero_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: @zero_test_permvar_qi_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> zeroinitializer)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @shuffle_test_permvar_qi_512(<64 x i8> %a0) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 63, i8 62, i8 61, i8 60, i8 59, i8 58, i8 57, i8 56, i8 55, i8 54, i8 53, i8 52, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @shuffle_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: @shuffle_test_permvar_qi_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 63, i8 62, i8 61, i8 60, i8 59, i8 58, i8 57, i8 56, i8 55, i8 54, i8 53, i8 52, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @undef_test_permvar_qi_512(<64 x i8> %a0) {
+; CHECK-LABEL: @undef_test_permvar_qi_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 undef, i8 62, i8 61, i8 60, i8 59, i8 58, i8 57, i8 56, i8 55, i8 54, i8 53, i8 52, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @undef_test_permvar_qi_512_mask(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: @undef_test_permvar_qi_512_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> [[A0:%.*]], <64 x i8> undef, <64 x i32> <i32 undef, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <64 x i1> [[TMP2]], <64 x i8> [[TMP1]], <64 x i8> [[PASSTHRU:%.*]]
+; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a0, <64 x i8> <i8 undef, i8 62, i8 61, i8 60, i8 59, i8 58, i8 57, i8 56, i8 55, i8 54, i8 53, i8 52, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passthru
+  ret <64 x i8> %3
+}
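+
+; When the index operand is a constant vector, the variable permute folds
+; to a shufflevector with the equivalent mask: identity indices fold away
+; entirely, splat-zero indices become a zeroinitializer mask, and undef
+; indices become undef mask elements. The masked variants keep only the
+; mask bitcast and select around the folded shuffle.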
+
+declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
+
+define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @test_add_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @test_add_ps_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  ret <16 x float> %1
+}
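+
+; Rounding argument 4 is _MM_FROUND_CUR_DIRECTION (use the current MXCSR
+; rounding mode), which matches the generic IR semantics, so the call
+; above folds to a plain fadd. Argument 8 (round-to-nearest with
+; suppressed exceptions) has no plain-IR equivalent, so those calls are
+; kept; the same applies to the sub/mul/div tests below.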
+
+define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: @test_add_ps_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
+}
+
+define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: @test_add_ps_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
+}
+
+declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32)
+
+define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @test_add_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @test_add_pd_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_add_pd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
+}
+
+define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_add_pd_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
+}
+
+declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
+
+define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @test_sub_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @test_sub_ps_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: @test_sub_ps_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
+}
+
+define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: @test_sub_ps_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
+}
+
+declare <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double>, <8 x double>, i32)
+
+define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @test_sub_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @test_sub_pd_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_sub_pd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
+}
+
+define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_sub_pd_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
+}
+
+declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
+
+define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @test_mul_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @test_mul_ps_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: @test_mul_ps_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
+}
+
+define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: @test_mul_ps_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
+}
+
+declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
+
+define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @test_mul_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @test_mul_pd_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mul_pd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
+}
+
+define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_mul_pd_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
+}
+
+declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
+
+define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @test_div_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @test_div_ps_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: @test_div_ps_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
+}
+
+define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: @test_div_ps_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
+  ret <16 x float> %3
+}
+
+declare <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double>, <8 x double>, i32)
+
+define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @test_div_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @test_div_pd_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_div_pd_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
+}
+
+define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: @test_div_pd_mask_round(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
+  ret <8 x double> %3
+}
+
+declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
+
+define i32 @test_comi_ss_0(float %a, float %b) {
+; CHECK-LABEL: @test_comi_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 0, i32 4)
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <4 x float> undef, float %a, i32 0
+  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
+  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
+  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
+  %5 = insertelement <4 x float> undef, float %b, i32 0
+  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
+  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
+  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
+  %9 = tail call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %4, <4 x float> %8, i32 0, i32 4)
+  ret i32 %9
+}
+
+declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
+
+define i32 @test_comi_sd_0(double %a, double %b) {
+; CHECK-LABEL: @test_comi_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]], i32 0, i32 4)
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = insertelement <2 x double> undef, double %a, i32 0
+  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
+  %3 = insertelement <2 x double> undef, double %b, i32 0
+  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
+  %5 = tail call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %2, <2 x double> %4, i32 0, i32 4)
+  ret i32 %5
+}
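+
+; vcomi.ss and vcomi.sd read only element 0 of each vector operand, so the
+; constant insertions into the upper lanes above are dead and are removed,
+; leaving a single insertelement per operand.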

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare i32 @llvm.x86.tbm.bextri.u32(i32, i32) nounwind readnone
+declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone
+declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
+declare i64 @llvm.x86.bmi.bextr.64(i64, i64) nounwind readnone
+declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) nounwind readnone
+declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) nounwind readnone
+
+define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u32(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.tbm.bextri.u32(i32 [[A:%.*]], i32 1296)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a, i32 1296)
+  ret i32 %1
+}
+
+define i32 @test_x86_tbm_bextri_u32_zero_length(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u32_zero_length(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a, i32 1)
+  ret i32 %1
+}
+
+define i32 @test_x86_tbm_bextri_u32_large_shift(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u32_large_shift(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a, i32 288)
+  ret i32 %1
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u64(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.x86.tbm.bextri.u64(i64 [[A:%.*]], i64 1312)
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %1 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a, i64 1312)
+  ret i64 %1
+}
+
+define i64 @test_x86_tbm_bextri_u64_zero_length(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u64_zero_length(
+; CHECK-NEXT:    ret i64 0
+;
+  %1 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a, i64 1)
+  ret i64 %1
+}
+
+define i64 @test_x86_tbm_bextri_u64_large_shift(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u64_large_shift(
+; CHECK-NEXT:    ret i64 0
+;
+  %1 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a, i64 320)
+  ret i64 %1
+}
+
+define i32 @test_x86_tbm_bextri_u32_constfold() nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u32_constfold(
+; CHECK-NEXT:    ret i32 57005
+;
+  %1 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 3735928559, i32 4112) ; extract bits 31:16 from 0xDEADBEEF
+  ret i32 %1
+}
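+
+; The BEXTR/BEXTRI control word holds the start bit in bits 7:0 and the
+; field length in bits 15:8. Here 4112 = 0x1010 extracts 16 bits starting
+; at bit 16: (0xDEADBEEF >> 16) & 0xFFFF = 0xDEAD = 57005. In the
+; constfold2/constfold3 tests below the length runs past the operand
+; width and is clamped, giving 0xDEADBEEF >> 4 = 0x0DEADBEE = 233495534.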
+
+define i32 @test_x86_tbm_bextri_u32_constfold2() nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u32_constfold2(
+; CHECK-NEXT:    ret i32 233495534
+;
+  %1 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 3735928559, i32 8196) ; extract bits 35:4 from 0xDEADBEEF
+  ret i32 %1
+}
+
+define i32 @test_x86_tbm_bextri_u32_constfold3() nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u32_constfold3(
+; CHECK-NEXT:    ret i32 233495534
+;
+  %1 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 3735928559, i32 16388) ; extract bits 67:4 from 0xDEADBEEF
+  ret i32 %1
+}
+
+define i64 @test_x86_tbm_bextri_u64_constfold() nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u64_constfold(
+; CHECK-NEXT:    ret i64 57005
+;
+  %1 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 3735928559, i64 4112) ; extract bits 31:16 from 0xDEADBEEF
+  ret i64 %1
+}
+
+define i64 @test_x86_tbm_bextri_u64_constfold2() nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u64_constfold2(
+; CHECK-NEXT:    ret i64 233495534
+;
+  %1 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 3735928559, i64 16388) ; extract bits 67:4 from 0xDEADBEEF
+  ret i64 %1
+}
+
+define i64 @test_x86_tbm_bextri_u64_constfold3() nounwind readnone {
+; CHECK-LABEL: @test_x86_tbm_bextri_u64_constfold3(
+; CHECK-NEXT:    ret i64 233495534
+;
+  %1 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 3735928559, i64 32772) ; extract bits 131:4 from 0xDEADBEEF
+  ret i64 %1
+}
+
+define i32 @test_x86_bmi_bextri_32(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_32(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.bmi.bextr.32(i32 [[A:%.*]], i32 1296)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a, i32 1296)
+  ret i32 %1
+}
+
+define i32 @test_x86_bmi_bextri_32_zero_length(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_32_zero_length(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a, i32 1)
+  ret i32 %1
+}
+
+define i32 @test_x86_bmi_bextri_32_large_shift(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_32_large_shift(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a, i32 288)
+  ret i32 %1
+}
+
+define i64 @test_x86_bmi_bextri_64(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_64(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.x86.bmi.bextr.64(i64 [[A:%.*]], i64 1312)
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a, i64 1312)
+  ret i64 %1
+}
+
+define i64 @test_x86_bmi_bextri_64_zero_length(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_64_zero_length(
+; CHECK-NEXT:    ret i64 0
+;
+  %1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a, i64 1)
+  ret i64 %1
+}
+
+define i64 @test_x86_bmi_bextri_64_large_shift(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_64_large_shift(
+; CHECK-NEXT:    ret i64 0
+;
+  %1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a, i64 320)
+  ret i64 %1
+}
+
+define i32 @test_x86_bmi_bextri_32_constfold() nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_32_constfold(
+; CHECK-NEXT:    ret i32 57005
+;
+  %1 = tail call i32 @llvm.x86.bmi.bextr.32(i32 3735928559, i32 4112) ; extract bits 31:16 from 0xDEADBEEF
+  ret i32 %1
+}
+
+define i32 @test_x86_bmi_bextri_32_constfold2() nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_32_constfold2(
+; CHECK-NEXT:    ret i32 233495534
+;
+  %1 = tail call i32 @llvm.x86.bmi.bextr.32(i32 3735928559, i32 8196) ; extract bits 35:4 from 0xDEADBEEF
+  ret i32 %1
+}
+
+define i32 @test_x86_bmi_bextri_32_constfold3() nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_32_constfold3(
+; CHECK-NEXT:    ret i32 233495534
+;
+  %1 = tail call i32 @llvm.x86.bmi.bextr.32(i32 3735928559, i32 16388) ; extract bits 67:4 from 0xDEADBEEF
+  ret i32 %1
+}
+
+define i64 @test_x86_bmi_bextri_64_constfold() nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_64_constfold(
+; CHECK-NEXT:    ret i64 57005
+;
+  %1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 3735928559, i64 4112) ; extract bits 31:16 from 0xDEADBEEF
+  ret i64 %1
+}
+
+define i64 @test_x86_bmi_bextri_64_constfold2() nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_64_constfold2(
+; CHECK-NEXT:    ret i64 233495534
+;
+  %1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 3735928559, i64 16388) ; extract bits 67:4 from 0xDEADBEEF
+  ret i64 %1
+}
+
+define i64 @test_x86_bmi_bextri_64_constfold3() nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bextri_64_constfold3(
+; CHECK-NEXT:    ret i64 233495534
+;
+  %1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 3735928559, i64 32772) ; extract bits 131:4 from 0xDEADBEEF
+  ret i64 %1
+}
+
+define i32 @test_x86_bmi_bzhi_32(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bzhi_32(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.x86.bmi.bzhi.32(i32 [[A:%.*]], i32 31)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a, i32 31)
+  ret i32 %1
+}
+
+define i32 @test_x86_bmi_bzhi_32_zero(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bzhi_32_zero(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a, i32 0)
+  ret i32 %1
+}
+
+define i32 @test_x86_bmi_bzhi_32_max(i32 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bzhi_32_max(
+; CHECK-NEXT:    ret i32 [[A:%.*]]
+;
+  %1 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a, i32 32)
+  ret i32 %1
+}
+
+define i32 @test_x86_bmi_bzhi_32_constfold() nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bzhi_32_constfold(
+; CHECK-NEXT:    ret i32 1
+;
+  %1 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 5, i32 1)
+  ret i32 %1
+}
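+
+; BZHI clears every bit from the index (bits 7:0 of the second operand)
+; upward: index 0 yields 0, an index of at least the bit width leaves the
+; input unchanged, and bzhi(5, 1) keeps only bit 0 of 0b101, i.e. 1.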
+
+define i64 @test_x86_bmi_bzhi_64(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bzhi_64(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.x86.bmi.bzhi.64(i64 [[A:%.*]], i64 63)
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+  %1 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a, i64 63)
+  ret i64 %1
+}
+
+define i64 @test_x86_bmi_bzhi_64_zero(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bzhi_64_zero(
+; CHECK-NEXT:    ret i64 0
+;
+  %1 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a, i64 0)
+  ret i64 %1
+}
+
+define i64 @test_x86_bmi_bzhi_64_max(i64 %a) nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bzhi_64_max(
+; CHECK-NEXT:    ret i64 [[A:%.*]]
+;
+  %1 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a, i64 64)
+  ret i64 %1
+}
+
+define i64 @test_x86_bmi_bzhi_64_constfold() nounwind readnone {
+; CHECK-LABEL: @test_x86_bmi_bzhi_64_constfold(
+; CHECK-NEXT:    ret i64 1
+;
+  %1 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 5, i64 1)
+  ret i64 %1
+}

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-crc32-demanded.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-crc32-demanded.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-crc32-demanded.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-crc32-demanded.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,17 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; crc32 with a 64-bit destination zeros the high 32 bits.
+; rdar://9467055
+
+define i64 @test() nounwind {
+entry:
+; CHECK: test
+; CHECK: tail call i64 @llvm.x86.sse42.crc32.64.64
+; CHECK-NOT: and
+; CHECK: ret
+  %0 = tail call i64 @llvm.x86.sse42.crc32.64.64(i64 0, i64 4) nounwind
+  %1 = and i64 %0, 4294967295
+  ret i64 %1
+}
+
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind readnone

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-f16c.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-f16c.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-f16c.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-f16c.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>)
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>)
+
+;
+; Vector Demanded Bits
+;
+
+; Only bottom 4 elements required.
+define <4 x float> @demand_vcvtph2ps_128(<8 x i16> %A) {
+; CHECK-LABEL: @demand_vcvtph2ps_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %A)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1)
+  ret <4 x float> %2
+}
+
+; All 8 elements required.
+define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
+; CHECK-LABEL: @demand_vcvtph2ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> [[TMP1]])
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = shufflevector <8 x i16> %A, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %2 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
+  ret <8 x float> %2
+}
+
+;
+; Constant Folding
+;
+
+define <4 x float> @fold_vcvtph2ps_128() {
+; CHECK-LABEL: @fold_vcvtph2ps_128(
+; CHECK-NEXT:    ret <4 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float -0.000000e+00>
+;
+  %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 14336, i16 15360, i16 32768, i16 16384, i16 31743, i16 48128, i16 49152>)
+  ret <4 x float> %1
+}
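+
+; Worked half-precision bit patterns for the folds here: 0x0000 -> 0.0,
+; 0x3800 -> 0.5, 0x3C00 -> 1.0, 0x8000 -> -0.0; the 256-bit variant below
+; additionally converts 0x4000 -> 2.0, 0x7BFF -> 65504.0 (the largest
+; finite half), 0xBC00 -> -1.0 and 0xC000 -> -2.0.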
+
+define <8 x float> @fold_vcvtph2ps_256() {
+; CHECK-LABEL: @fold_vcvtph2ps_256(
+; CHECK-NEXT:    ret <8 x float> <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float -0.000000e+00, float 2.000000e+00, float 6.550400e+04, float -1.000000e+00, float -2.000000e+00>
+;
+  %1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 14336, i16 15360, i16 32768, i16 16384, i16 31743, i16 48128, i16 49152>)
+  ret <8 x float> %1
+}
+
+define <4 x float> @fold_vcvtph2ps_128_zero() {
+; CHECK-LABEL: @fold_vcvtph2ps_128_zero(
+; CHECK-NEXT:    ret <4 x float> zeroinitializer
+;
+  %1 = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  ret <4 x float> %1
+}
+
+define <8 x float> @fold_vcvtph2ps_256_zero() {
+; CHECK-LABEL: @fold_vcvtph2ps_256_zero(
+; CHECK-NEXT:    ret <8 x float> zeroinitializer
+;
+  %1 = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  ret <8 x float> %1
+}

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-fma.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-fma.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-fma.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-fma.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define <4 x float> @test_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfmadd_ss(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[A]], float [[TMP4]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP5]]
+;
+  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1
+  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
+  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
+  %7 = extractelement <4 x float> %a, i64 0
+  %8 = extractelement <4 x float> %3, i64 0
+  %9 = extractelement <4 x float> %6, i64 0
+  %10 = call float @llvm.fma.f32(float %7, float %8, float %9)
+  %11 = insertelement <4 x float> %a, float %10, i64 0
+  ret <4 x float> %11
+}
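+
+; Only lane 0 of each operand reaches the scalar @llvm.fma.f32 call, so
+; the constant insertions into lanes 1-3 above are dead and are stripped.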
+
+define float @test_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfmadd_ss_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
+; CHECK-NEXT:    ret float [[TMP4]]
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = extractelement <4 x float> %3, i64 0
+  %5 = extractelement <4 x float> %b, i64 0
+  %6 = extractelement <4 x float> %c, i64 0
+  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
+  %8 = insertelement <4 x float> %3, float %7, i64 0
+  %9 = extractelement <4 x float> %8, i32 0
+  ret float %9
+}
+
+define float @test_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @test_vfmadd_ss_1(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
+  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
+  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
+  %4 = extractelement <4 x float> %3, i64 0
+  %5 = extractelement <4 x float> %b, i64 0
+  %6 = extractelement <4 x float> %c, i64 0
+  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
+  %8 = insertelement <4 x float> %3, float %7, i64 0
+  %9 = extractelement <4 x float> %8, i32 1
+  ret float %9
+}
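+
+; The extract reads lane 1, which the insert of the fma result into lane 0
+; never touches, so the whole function folds to the 1.0 written by the
+; first insertelement.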
+
+define <2 x double> @test_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfmadd_sd(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[A]], double [[TMP4]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP5]]
+;
+  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
+  %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
+  %3 = extractelement <2 x double> %a, i64 0
+  %4 = extractelement <2 x double> %1, i64 0
+  %5 = extractelement <2 x double> %2, i64 0
+  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
+  %7 = insertelement <2 x double> %a, double %6, i64 0
+  ret <2 x double> %7
+}
+
+define double @test_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfmadd_sd_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
+; CHECK-NEXT:    ret double [[TMP4]]
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = extractelement <2 x double> %1, i64 0
+  %3 = extractelement <2 x double> %b, i64 0
+  %4 = extractelement <2 x double> %c, i64 0
+  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+  %6 = insertelement <2 x double> %1, double %5, i64 0
+  %7 = extractelement <2 x double> %6, i32 0
+  ret double %7
+}
+
+define double @test_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: @test_vfmadd_sd_1(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
+  %2 = extractelement <2 x double> %1, i64 0
+  %3 = extractelement <2 x double> %b, i64 0
+  %4 = extractelement <2 x double> %c, i64 0
+  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
+  %6 = insertelement <2 x double> %1, double %5, i64 0
+  %7 = extractelement <2 x double> %6, i32 1
+  ret double %7
+}
+
+declare float @llvm.fma.f32(float, float, float)
+declare double @llvm.fma.f64(double, double, double)

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-insertps.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-insertps.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-insertps.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-insertps.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
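+
+; insertps immediate layout: bits [7:6] select the source element of the
+; second operand, bits [5:4] the destination element, and bits [3:0] form
+; a zero mask applied after the insertion.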
+
+; If all zero mask bits are set, the result is zero regardless of the other control bits.
+
+define <4 x float> @insertps_0x0f(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0x0f(
+; CHECK-NEXT:    ret <4 x float> zeroinitializer
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 15)
+  ret <4 x float> %res
+}
+
+define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0xff(
+; CHECK-NEXT:    ret <4 x float> zeroinitializer
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 255)
+  ret <4 x float> %res
+}
+
+; If some zero mask bits are set that do not override the insertion, we do not change anything.
+
+define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0x0c(
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> [[V1:%.*]], <4 x float> [[V2:%.*]], i8 12)
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12)
+  ret <4 x float> %res
+}
+
+; ...unless both input vectors are the same operand.
+
+define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) {
+; CHECK-LABEL: @insertps_0x15_single_input(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21)
+  ret <4 x float> %res
+}
+
+; The zero mask overrides the insertion lane.
+
+define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) {
+; CHECK-LABEL: @insertps_0x1a_single_input(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26)
+  ret <4 x float> %res
+}
+
+; The zero mask overrides the insertion lane, so the second input vector is not used.
+
+define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0xc1(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[V1:%.*]], float 0.000000e+00, i32 0
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193)
+  ret <4 x float> %res
+}
+
+; If no zero mask bits are set, convert to a shuffle.
+
+define <4 x float> @insertps_0x00(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0x00(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V2:%.*]], <4 x float> [[V1:%.*]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 0)
+  ret <4 x float> %res
+}
+
+define <4 x float> @insertps_0x10(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0x10(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> [[V2:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 16)
+  ret <4 x float> %res
+}
+
+define <4 x float> @insertps_0x20(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0x20(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> [[V2:%.*]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 32)
+  ret <4 x float> %res
+}
+
+define <4 x float> @insertps_0x30(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0x30(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> [[V2:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 48)
+  ret <4 x float> %res
+}
+
+define <4 x float> @insertps_0xc0(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0xc0(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> [[V2:%.*]], <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 192)
+  ret <4 x float> %res
+}
+
+define <4 x float> @insertps_0xd0(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0xd0(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> [[V2:%.*]], <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 208)
+  ret <4 x float> %res
+}
+
+define <4 x float> @insertps_0xe0(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0xe0(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> [[V2:%.*]], <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 224)
+  ret <4 x float> %res
+}
+
+define <4 x float> @insertps_0xf0(<4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: @insertps_0xf0(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[V1:%.*]], <4 x float> [[V2:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 240)
+  ret <4 x float> %res
+}

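[Editorial note, not part of the committed diff: every insertps test above follows from the SSE4.1 immediate encoding, in which bits 7:6 select the element taken from the second source vector, bits 5:4 select the destination lane, and bits 3:0 form a zero mask that forces the corresponding result lanes to +0.0. A minimal Python sketch of that decoding, assuming the Intel-documented field layout (the function and variable names here are ours):

    def decode_insertps_imm(imm8):
        # Field layout per the SSE4.1 INSERTPS definition:
        #   bits 7:6 -> element taken from the second source vector
        #   bits 5:4 -> destination lane that receives the insertion
        #   bits 3:0 -> zero mask; each set bit forces that result lane to +0.0
        count_s = (imm8 >> 6) & 0x3
        count_d = (imm8 >> 4) & 0x3
        zmask = imm8 & 0xF
        if zmask == 0xF:
            return "all lanes zeroed -> fold to zeroinitializer"
        if zmask & (1 << count_d):
            return "zero mask overrides insertion lane %d" % count_d
        return "insert src2[%d] into lane %d, zero mask %#x" % (count_s, count_d, zmask)

    # 0x0f folds to zero, 0xc1 zeroes the very lane it inserts into (so the
    # second operand is dead), and 0x10 is the <0,4,2,3> shuffle checked above.
    for imm in (0x0F, 0xC1, 0x10):
        print(hex(imm), "->", decode_insertps_imm(imm))
]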
Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-masked-memops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-masked-memops.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-masked-memops.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-masked-memops.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+;; MASKED LOADS
+
+; If the mask isn't constant, do nothing.
+
+define <4 x float> @mload(i8* %f, <4 x i32> %mask) {
+; CHECK-LABEL: @mload(
+; CHECK-NEXT:    [[LD:%.*]] = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* [[F:%.*]], <4 x i32> [[MASK:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[LD]]
+;
+  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> %mask)
+  ret <4 x float> %ld
+
+}
+
+; Zero mask returns a zero vector.
+
+define <4 x float> @mload_zeros(i8* %f) {
+; CHECK-LABEL: @mload_zeros(
+; CHECK-NEXT:    ret <4 x float> zeroinitializer
+;
+  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> zeroinitializer)
+  ret <4 x float> %ld
+
+}
+
+; Only the sign bit of each mask element matters.
+
+define <4 x float> @mload_fake_ones(i8* %f) {
+; CHECK-LABEL: @mload_fake_ones(
+; CHECK-NEXT:    ret <4 x float> zeroinitializer
+;
+  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>)
+  ret <4 x float> %ld
+
+}
+
+; All mask bits are set, so this is just a vector load.
+
+define <4 x float> @mload_real_ones(i8* %f) {
+; CHECK-LABEL: @mload_real_ones(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[UNMASKEDLOAD:%.*]] = load <4 x float>, <4 x float>* [[CASTVEC]], align 1
+; CHECK-NEXT:    ret <4 x float> [[UNMASKEDLOAD]]
+;
+  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 2147483648>)
+  ret <4 x float> %ld
+
+}
+
+; It's a constant mask, so convert to the generic llvm.masked.load intrinsic. The backend should optimize further.
+
+define <4 x float> @mload_one_one(i8* %f) {
+; CHECK-LABEL: @mload_one_one(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[CASTVEC]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>)
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
+  ret <4 x float> %ld
+
+}
+
+; Try doubles.
+
+define <2 x double> @mload_one_one_double(i8* %f) {
+; CHECK-LABEL: @mload_one_one_double(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[CASTVEC]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x double> <double undef, double 0.000000e+00>)
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %ld = tail call <2 x double> @llvm.x86.avx.maskload.pd(i8* %f, <2 x i64> <i64 -1, i64 0>)
+  ret <2 x double> %ld
+
+}
+
+; Try 256-bit FP ops.
+
+define <8 x float> @mload_v8f32(i8* %f) {
+; CHECK-LABEL: @mload_v8f32(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[CASTVEC]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>)
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
+  ret <8 x float> %ld
+
+}
+
+define <4 x double> @mload_v4f64(i8* %f) {
+; CHECK-LABEL: @mload_v4f64(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[CASTVEC]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> <double undef, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>)
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %ld = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
+  ret <4 x double> %ld
+
+}
+
+; Try the AVX2 variants.
+
+define <4 x i32> @mload_v4i32(i8* %f) {
+; CHECK-LABEL: @mload_v4i32(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[CASTVEC]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 0, i32 0, i32 0, i32 undef>)
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %ld = tail call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>)
+  ret <4 x i32> %ld
+
+}
+
+define <2 x i64> @mload_v2i64(i8* %f) {
+; CHECK-LABEL: @mload_v2i64(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[CASTVEC]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> <i64 undef, i64 0>)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %ld = tail call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %f, <2 x i64> <i64 -1, i64 0>)
+  ret <2 x i64> %ld
+
+}
+
+define <8 x i32> @mload_v8i32(i8* %f) {
+; CHECK-LABEL: @mload_v8i32(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[CASTVEC]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0>)
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %ld = tail call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>)
+  ret <8 x i32> %ld
+
+}
+
+define <4 x i64> @mload_v4i64(i8* %f) {
+; CHECK-LABEL: @mload_v4i64(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* [[CASTVEC]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> <i64 undef, i64 0, i64 0, i64 0>)
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>)
+  ret <4 x i64> %ld
+
+}
+
+
+;; MASKED STORES
+
+; If the mask isn't constant, do nothing.
+
+define void @mstore(i8* %f, <4 x i32> %mask, <4 x float> %v) {
+; CHECK-LABEL: @mstore(
+; CHECK-NEXT:    tail call void @llvm.x86.avx.maskstore.ps(i8* [[F:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[V:%.*]])
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> %mask, <4 x float> %v)
+  ret void
+
+}
+
+; Zero mask is a nop.
+
+define void @mstore_zeros(i8* %f, <4 x float> %v)  {
+; CHECK-LABEL: @mstore_zeros(
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> zeroinitializer, <4 x float> %v)
+  ret void
+
+}
+
+; Only the sign bit of each mask element matters.
+
+define void @mstore_fake_ones(i8* %f, <4 x float> %v) {
+; CHECK-LABEL: @mstore_fake_ones(
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 1, i32 2, i32 3, i32 2147483647>, <4 x float> %v)
+  ret void
+
+}
+
+; All mask bits are set, so this is just a vector store.
+
+define void @mstore_real_ones(i8* %f, <4 x float> %v) {
+; CHECK-LABEL: @mstore_real_ones(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[V:%.*]], <4 x float>* [[CASTVEC]], align 1
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -2147483648>, <4 x float> %v)
+  ret void
+
+}
+
+; It's a constant mask, so convert to the generic llvm.masked.store intrinsic. The backend should optimize further.
+
+define void @mstore_one_one(i8* %f, <4 x float> %v) {
+; CHECK-LABEL: @mstore_one_one(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> [[V:%.*]], <4 x float>* [[CASTVEC]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx.maskstore.ps(i8* %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>, <4 x float> %v)
+  ret void
+
+}
+
+; Try doubles.
+
+define void @mstore_one_one_double(i8* %f, <2 x double> %v) {
+; CHECK-LABEL: @mstore_one_one_double(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <2 x double>*
+; CHECK-NEXT:    call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> [[V:%.*]], <2 x double>* [[CASTVEC]], i32 1, <2 x i1> <i1 true, i1 false>)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx.maskstore.pd(i8* %f, <2 x i64> <i64 -1, i64 0>, <2 x double> %v)
+  ret void
+
+}
+
+; Try 256-bit FP ops.
+
+define void @mstore_v8f32(i8* %f, <8 x float> %v) {
+; CHECK-LABEL: @mstore_v8f32(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[V:%.*]], <8 x float>* [[CASTVEC]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx.maskstore.ps.256(i8* %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x float> %v)
+  ret void
+
+}
+
+define void @mstore_v4f64(i8* %f, <4 x double> %v) {
+; CHECK-LABEL: @mstore_v4f64(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x double>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[V:%.*]], <4 x double>* [[CASTVEC]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx.maskstore.pd.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x double> %v)
+  ret void
+
+}
+
+; Try the AVX2 variants.
+
+define void @mstore_v4i32(i8* %f, <4 x i32> %v) {
+; CHECK-LABEL: @mstore_v4i32(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[V:%.*]], <4 x i32>* [[CASTVEC]], i32 1, <4 x i1> <i1 false, i1 false, i1 true, i1 true>)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx2.maskstore.d(i8* %f, <4 x i32> <i32 0, i32 1, i32 -1, i32 -2>, <4 x i32> %v)
+  ret void
+
+}
+
+define void @mstore_v2i64(i8* %f, <2 x i64> %v) {
+; CHECK-LABEL: @mstore_v2i64(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <2 x i64>*
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[V:%.*]], <2 x i64>* [[CASTVEC]], i32 1, <2 x i1> <i1 true, i1 false>)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx2.maskstore.q(i8* %f, <2 x i64> <i64 -1, i64 0>, <2 x i64> %v)
+  ret void
+
+}
+
+define void @mstore_v8i32(i8* %f, <8 x i32> %v) {
+; CHECK-LABEL: @mstore_v8i32(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <8 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[V:%.*]], <8 x i32>* [[CASTVEC]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx2.maskstore.d.256(i8* %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x i32> %v)
+  ret void
+
+}
+
+define void @mstore_v4i64(i8* %f, <4 x i64> %v) {
+; CHECK-LABEL: @mstore_v4i64(
+; CHECK-NEXT:    [[CASTVEC:%.*]] = bitcast i8* [[F:%.*]] to <4 x i64>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> [[V:%.*]], <4 x i64>* [[CASTVEC]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.avx2.maskstore.q.256(i8* %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x i64> %v)
+  ret void
+
+}
+
+; The original SSE2 masked store variant.
+
+define void @mstore_v16i8_sse2_zeros(<16 x i8> %d, i8* %p) {
+; CHECK-LABEL: @mstore_v16i8_sse2_zeros(
+; CHECK-NEXT:    ret void
+;
+  tail call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %d, <16 x i8> zeroinitializer, i8* %p)
+  ret void
+
+}
+
+
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>)
+declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>)
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>)
+
+declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>)
+declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>)
+declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>)
+declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>)
+
+declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>)
+declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>)
+declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>)
+
+declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>)
+declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>)
+declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>)
+declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>)
+
+declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*)
+

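[Editorial note, not part of the committed diff: the masked-memop tests above all exercise one pattern. Because only the sign bit of each mask element is significant for these AVX/AVX2 intrinsics, a constant vector mask collapses to a <N x i1> operand for the generic llvm.masked.load/llvm.masked.store intrinsics, with alignment 1 since the x86 forms impose none. A hedged Python sketch of just the mask conversion (names are ours):

    def mask_to_i1(mask_elts, bits):
        # An element maps to i1 true exactly when its top (sign) bit is set;
        # all other bits of the mask element are ignored.
        sign = 1 << (bits - 1)
        return [bool((m & ((1 << bits) - 1)) & sign) for m in mask_elts]

    # <i32 1, 2, 3, 2147483647> has no sign bits set -> all-false mask,
    # matching the @mload_fake_ones fold to zeroinitializer.
    print(mask_to_i1([1, 2, 3, 2147483647], 32))
    # <i32 0, 0, 0, -1> -> [False, False, False, True], as in @mload_one_one.
    print(mask_to_i1([0, 0, 0, -1], 32))
]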
Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-movmsk.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-movmsk.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-movmsk.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-movmsk.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,458 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+;
+; DemandedBits - MOVMSK zeros the upper bits of the result.
+;
+
+define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
+; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %2 = and i32 %1, 255
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_sse_movmsk_ps(<4 x float> %a0) {
+; CHECK-LABEL: @test_upper_x86_sse_movmsk_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A0:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+  %2 = and i32 %1, 15
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_sse2_movmsk_pd(<2 x double> %a0) {
+; CHECK-LABEL: @test_upper_x86_sse2_movmsk_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[A0:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i1> [[TMP2]] to i2
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i2 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+  %2 = and i32 %1, 3
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
+; CHECK-LABEL: @test_upper_x86_sse2_pmovmskb_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <16 x i8> [[A0:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i1> [[TMP1]] to i16
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
+  %2 = and i32 %1, 65535
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_avx_movmsk_ps_256(<8 x float> %a0) {
+; CHECK-LABEL: @test_upper_x86_avx_movmsk_ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A0:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i1> [[TMP2]] to i8
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+  %2 = and i32 %1, 255
+  ret i32 %2
+}
+
+define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
+; CHECK-LABEL: @test_upper_x86_avx_movmsk_pd_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x double> [[A0:%.*]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+  %2 = and i32 %1, 15
+  ret i32 %2
+}
+
+; llvm.x86.avx2.pmovmskb uses the whole 32-bit register, so its result has no known-zero upper bits.
+
+;
+; DemandedBits - If we don't use the lower bits then we just return zero.
+;
+
+define i32 @test_lower_x86_mmx_pmovmskb(x86_mmx %a0) {
+; CHECK-LABEL: @test_lower_x86_mmx_pmovmskb(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %2 = and i32 %1, -256
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_sse_movmsk_ps(<4 x float> %a0) {
+; CHECK-LABEL: @test_lower_x86_sse_movmsk_ps(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+  %2 = and i32 %1, -16
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_sse2_movmsk_pd(<2 x double> %a0) {
+; CHECK-LABEL: @test_lower_x86_sse2_movmsk_pd(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+  %2 = and i32 %1, -4
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
+; CHECK-LABEL: @test_lower_x86_sse2_pmovmskb_128(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
+  %2 = and i32 %1, -65536
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_avx_movmsk_ps_256(<8 x float> %a0) {
+; CHECK-LABEL: @test_lower_x86_avx_movmsk_ps_256(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+  %2 = and i32 %1, -256
+  ret i32 %2
+}
+
+define i32 @test_lower_x86_avx_movmsk_pd_256(<4 x double> %a0) {
+; CHECK-LABEL: @test_lower_x86_avx_movmsk_pd_256(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+  %2 = and i32 %1, -16
+  ret i32 %2
+}
+
+; llvm.x86.avx2.pmovmskb uses the whole 32-bit register, so there is no equivalent fold for it.
+
+;
+; Constant Folding (UNDEF -> ZERO)
+;
+
+define i32 @undef_x86_mmx_pmovmskb() {
+; CHECK-LABEL: @undef_x86_mmx_pmovmskb(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx undef)
+  ret i32 %1
+}
+
+define i32 @undef_x86_sse_movmsk_ps() {
+; CHECK-LABEL: @undef_x86_sse_movmsk_ps(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> undef)
+  ret i32 %1
+}
+
+define i32 @undef_x86_sse2_movmsk_pd() {
+; CHECK-LABEL: @undef_x86_sse2_movmsk_pd(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> undef)
+  ret i32 %1
+}
+
+define i32 @undef_x86_sse2_pmovmskb_128() {
+; CHECK-LABEL: @undef_x86_sse2_pmovmskb_128(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> undef)
+  ret i32 %1
+}
+
+define i32 @undef_x86_avx_movmsk_ps_256() {
+; CHECK-LABEL: @undef_x86_avx_movmsk_ps_256(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> undef)
+  ret i32 %1
+}
+
+define i32 @undef_x86_avx_movmsk_pd_256() {
+; CHECK-LABEL: @undef_x86_avx_movmsk_pd_256(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> undef)
+  ret i32 %1
+}
+
+define i32 @undef_x86_avx2_pmovmskb() {
+; CHECK-LABEL: @undef_x86_avx2_pmovmskb(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> undef)
+  ret i32 %1
+}
+
+;
+; Constant Folding (ZERO -> ZERO)
+;
+
+define i32 @zero_x86_mmx_pmovmskb() {
+; CHECK-LABEL: @zero_x86_mmx_pmovmskb(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx))
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = bitcast <1 x i64> zeroinitializer to x86_mmx
+  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
+  ret i32 %2
+}
+
+define i32 @zero_x86_sse_movmsk_ps() {
+; CHECK-LABEL: @zero_x86_sse_movmsk_ps(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> zeroinitializer)
+  ret i32 %1
+}
+
+define i32 @zero_x86_sse2_movmsk_pd() {
+; CHECK-LABEL: @zero_x86_sse2_movmsk_pd(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> zeroinitializer)
+  ret i32 %1
+}
+
+define i32 @zero_x86_sse2_pmovmskb_128() {
+; CHECK-LABEL: @zero_x86_sse2_pmovmskb_128(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> zeroinitializer)
+  ret i32 %1
+}
+
+define i32 @zero_x86_avx_movmsk_ps_256() {
+; CHECK-LABEL: @zero_x86_avx_movmsk_ps_256(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> zeroinitializer)
+  ret i32 %1
+}
+
+define i32 @zero_x86_avx_movmsk_pd_256() {
+; CHECK-LABEL: @zero_x86_avx_movmsk_pd_256(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> zeroinitializer)
+  ret i32 %1
+}
+
+define i32 @zero_x86_avx2_pmovmskb() {
+; CHECK-LABEL: @zero_x86_avx2_pmovmskb(
+; CHECK-NEXT:    ret i32 0
+;
+  %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> zeroinitializer)
+  ret i32 %1
+}
+
+;
+; Constant Folding
+;
+
+define i32 @fold_x86_mmx_pmovmskb() {
+; CHECK-LABEL: @fold_x86_mmx_pmovmskb(
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> <i8 0, i8 -1, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 0> to x86_mmx))
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = bitcast <8 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256> to x86_mmx
+  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
+  ret i32 %2
+}
+
+define i32 @fold_x86_sse_movmsk_ps() {
+; CHECK-LABEL: @fold_x86_sse_movmsk_ps(
+; CHECK-NEXT:    ret i32 10
+;
+  %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> <float 1.0, float -1.0, float 100.0, float -200.0>)
+  ret i32 %1
+}
+
+define i32 @fold_x86_sse2_movmsk_pd() {
+; CHECK-LABEL: @fold_x86_sse2_movmsk_pd(
+; CHECK-NEXT:    ret i32 2
+;
+  %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> <double 1.0, double -1.0>)
+  ret i32 %1
+}
+
+define i32 @fold_x86_sse2_pmovmskb_128() {
+; CHECK-LABEL: @fold_x86_sse2_pmovmskb_128(
+; CHECK-NEXT:    ret i32 5654
+;
+  %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256, i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256>)
+  ret i32 %1
+}
+
+define i32 @fold_x86_avx_movmsk_ps_256() {
+; CHECK-LABEL: @fold_x86_avx_movmsk_ps_256(
+; CHECK-NEXT:    ret i32 170
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> <float 1.0, float -1.0, float 100.0, float -200.0, float +0.0, float -0.0, float 100000.0, float -5000000.0>)
+  ret i32 %1
+}
+
+define i32 @fold_x86_avx_movmsk_pd_256() {
+; CHECK-LABEL: @fold_x86_avx_movmsk_pd_256(
+; CHECK-NEXT:    ret i32 10
+;
+  %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> <double 1.0, double -1.0, double 100.0, double -200.0>)
+  ret i32 %1
+}
+
+define i32 @fold_x86_avx2_pmovmskb() {
+; CHECK-LABEL: @fold_x86_avx2_pmovmskb(
+; CHECK-NEXT:    ret i32 370546176
+;
+  %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256, i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256, i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256>)
+  ret i32 %1
+}
+
+define i32 @sext_sse_movmsk_ps(<4 x i1> %x) {
+; CHECK-LABEL: @sext_sse_movmsk_ps(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i4 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <4 x i1> %x to <4 x i32>
+  %bc = bitcast <4 x i32> %sext to <4 x float>
+  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
+  ret i32 %r
+}
+
+define i32 @sext_sse2_movmsk_pd(<2 x i1> %x) {
+; CHECK-LABEL: @sext_sse2_movmsk_pd(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i1> [[X:%.*]] to i2
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i2 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <2 x i1> %x to <2 x i64>
+  %bc = bitcast <2 x i64> %sext to <2 x double>
+  %r = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %bc)
+  ret i32 %r
+}
+
+define i32 @sext_sse2_pmovmskb_128(<16 x i1> %x) {
+; CHECK-LABEL: @sext_sse2_pmovmskb_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i1> [[X:%.*]] to i16
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <16 x i1> %x to <16 x i8>
+  %r = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %sext)
+  ret i32 %r
+}
+
+define i32 @sext_avx_movmsk_ps_256(<8 x i1> %x) {
+; CHECK-LABEL: @sext_avx_movmsk_ps_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <8 x i1> %x to <8 x i32>
+  %bc = bitcast <8 x i32> %sext to <8 x float>
+  %r = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %bc)
+  ret i32 %r
+}
+
+define i32 @sext_avx_movmsk_pd_256(<4 x i1> %x) {
+; CHECK-LABEL: @sext_avx_movmsk_pd_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i4 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %sext = sext <4 x i1> %x to <4 x i64>
+  %bc = bitcast <4 x i64> %sext to <4 x double>
+  %r = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %bc)
+  ret i32 %r
+}
+
+define i32 @sext_avx2_pmovmskb(<32 x i1> %x) {
+; CHECK-LABEL: @sext_avx2_pmovmskb(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x i1> [[X:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %sext = sext <32 x i1> %x to <32 x i8>
+  %r = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %sext)
+  ret i32 %r
+}
+
+; Bitcast from a sign-extended scalar.
+
+define i32 @sext_sse_movmsk_ps_scalar_source(i1 %x) {
+; CHECK-LABEL: @sext_sse_movmsk_ps_scalar_source(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 [[X:%.*]] to i128
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i128 [[SEXT]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %sext = sext i1 %x to i128
+  %bc = bitcast i128 %sext to <4 x float>
+  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
+  ret i32 %r
+}
+
+; Bitcast from a vector type with more elements.
+
+define i32 @sext_sse_movmsk_ps_too_many_elts(<8 x i1> %x) {
+; CHECK-LABEL: @sext_sse_movmsk_ps_too_many_elts(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <8 x i1> [[X:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SEXT]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %sext = sext <8 x i1> %x to <8 x i16>
+  %bc = bitcast <8 x i16> %sext to <4 x float>
+  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
+  ret i32 %r
+}
+
+; Handle this by doing a bitcasted sign-bit test after the sext.
+
+define i32 @sext_sse_movmsk_ps_must_replicate_bits(<2 x i1> %x) {
+; CHECK-LABEL: @sext_sse_movmsk_ps_must_replicate_bits(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[X:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[SEXT]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i4 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %sext = sext <2 x i1> %x to <2 x i64>
+  %bc = bitcast <2 x i64> %sext to <4 x float>
+  %r = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %bc)
+  ret i32 %r
+}
+
+declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx)
+
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
+declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>)
+
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
+declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>)
+declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>)

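[Editorial note, not part of the committed diff: the movmsk constant-folding expectations above are plain sign-bit gathers. MOVMSK/PMOVMSKB packs the sign bit of each lane into the low bits of an i32 result, lane 0 landing in bit 0, which is also why the DemandedBits tests can drop masks of the known-zero upper bits. A small sketch, assuming two's-complement truncation of the out-of-range i8 literals, that reproduces one of the checked constants:

    def movmsk(elts, bits):
        # Gather the top bit of each lane into the low bits of the result,
        # with lane 0 ending up in bit 0.
        sign = 1 << (bits - 1)
        out = 0
        for i, e in enumerate(elts):
            if (e & ((1 << bits) - 1)) & sign:
                out |= 1 << i
        return out

    # Reproduces 'ret i32 5654' in @fold_x86_sse2_pmovmskb_128
    # (note that i8 256 truncates to 0, so its sign bit is clear).
    print(movmsk([0, 255, -1, 127, -127, 63, 64, 256,
                  0, 255, -1, 127, -127, 63, 64, 256], 8))
]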
Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-muldq.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-muldq.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-muldq.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-muldq.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+;
+; UNDEF Elts
+;
+
+define <2 x i64> @undef_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuludq_128(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> undef)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @undef_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuludq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> undef, <8 x i32> undef)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @undef_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuludq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> undef)
+  ret <8 x i64> %1
+}
+
+define <2 x i64> @undef_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuldq_128(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> undef, <4 x i32> undef)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @undef_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuldq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> undef)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @undef_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @undef_pmuldq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> undef, <16 x i32> undef)
+  ret <8 x i64> %1
+}
+
+define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuludq_128(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuludq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuludq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer)
+  ret <8 x i64> %1
+}
+
+define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuldq_128(
+; CHECK-NEXT:    ret <2 x i64> zeroinitializer
+;
+  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuldq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @undef_zero_pmuldq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef)
+  ret <8 x i64> %1
+}
+
+;
+; Constant Folding
+;
+
+define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_128(
+; CHECK-NEXT:    ret <2 x i64> <i64 9223372030412324865, i64 4294967295>
+;
+  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2147483647, i32 1, i32 1, i32 3>)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_256(
+; CHECK-NEXT:    ret <4 x i64> zeroinitializer
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuludq_512(
+; CHECK-NEXT:    ret <8 x i64> <i64 0, i64 0, i64 255, i64 131070, i64 0, i64 -281474976645121, i64 140737488289792, i64 281470681743360>
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 0, i32 undef, i32 0, i32 1, i32 1, i32 2, i32 2, i32 undef, i32 undef, i32 -1, i32 -1, i32 65536, i32 -1, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 255, i32 -256, i32 65535, i32 -65536, i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
+  ret <8 x i64> %1
+}
+
+define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_128(
+; CHECK-NEXT:    ret <2 x i64> <i64 0, i64 2>
+;
+  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 undef, i32 1, i32 -2, i32 3>)
+  ret <2 x i64> %1
+}
+
+define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_256(
+; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 4294836225, i64 140737488289792, i64 -140737488355328>
+;
+  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>, <8 x i32> <i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
+  ret <4 x i64> %1
+}
+
+define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @fold_pmuldq_512(
+; CHECK-NEXT:    ret <8 x i64> zeroinitializer
+;
+  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 -1, i32 -3, i32 -1, i32 8, i32 10, i32 -256, i32 65536, i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>)
+  ret <8 x i64> %1
+}
+
+;
+; PMULUDQ/PMULDQ - only the even-numbered elements (0, 2, 4, ...) of the vXi32 inputs are required.
+;
+
+define <2 x i64> @test_demanded_elts_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuludq_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP3]], <i64 4294967295, i64 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = and <2 x i64> [[TMP4]], <i64 4294967295, i64 undef>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <2 x i64> [[TMP8]]
+;
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %3 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %1, <4 x i32> %2)
+  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %4
+}
+
+define <4 x i64> @test_demanded_elts_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuludq_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[TMP3]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[TMP4]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  %3 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %1, <8 x i32> %2)
+  ret <4 x i64> %3
+}
+
+define <8 x i64> @test_demanded_elts_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuludq_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i64> [[TMP3]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i64> [[TMP4]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <8 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  %3 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %1, <16 x i32> %2)
+  ret <8 x i64> %3
+}
+
+define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuldq_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i64> [[TMP3]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <2 x i64> [[TMP5]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP4]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <2 x i64> [[TMP7]], <i64 32, i64 32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <2 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
+;
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  %3 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %1, <4 x i32> %2)
+  ret <2 x i64> %3
+}
+
+define <4 x i64> @test_demanded_elts_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuldq_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP3]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i64> [[TMP4]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <4 x i64> [[TMP7]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
+; CHECK-NEXT:    ret <4 x i64> [[TMP10]]
+;
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  %3 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %1, <8 x i32> %2)
+  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
+  ret <4 x i64> %4
+}
+
+define <8 x i64> @test_demanded_elts_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @test_demanded_elts_pmuldq_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shl <8 x i64> [[TMP3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <8 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl <8 x i64> [[TMP4]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <8 x i64> [[TMP7]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
+; CHECK-NEXT:    ret <8 x i64> [[TMP10]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  %3 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %1, <16 x i32> %2)
+  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
+  ret <8 x i64> %4
+}
+
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
+declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) nounwind readnone
+declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) nounwind readnone

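[Editorial note, not part of the committed diff: the muldq folds above follow directly from the instruction semantics. PMULUDQ multiplies the even i32 elements of each operand zero-extended to 64 bits; PMULDQ does the same with sign extension. That is why only the even-numbered elements are demanded, and why InstCombine can rewrite the intrinsics as bitcast + and (or shl/ashr) + mul. A minimal Python sketch reproducing the 128-bit folds (names are ours; undef operands are modeled as 0, matching the folds above):

    MASK32 = 0xFFFFFFFF

    def sext32(x):
        # Reinterpret the low 32 bits as a signed value.
        x &= MASK32
        return x - (1 << 32) if x & 0x80000000 else x

    def pmuludq(a, b):
        # Unsigned multiply of the even elements, widened to i64.
        return [(a[2*i] & MASK32) * (b[2*i] & MASK32) for i in range(len(a) // 2)]

    def pmuldq(a, b):
        # Signed variant: sign-extend the even elements first.
        return [sext32(a[2*i]) * sext32(b[2*i]) for i in range(len(a) // 2)]

    # Matches 'ret <2 x i64> <i64 9223372030412324865, i64 4294967295>'
    # in @fold_pmuludq_128.
    print(pmuludq([-1, -1, -1, -1], [2147483647, 1, 1, 3]))
    # Matches the <i64 0, i64 2> result in @fold_pmuldq_128.
    print(pmuldq([0, -1, -1, -1], [0, 1, -2, 3]))
]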
Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-pack.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-pack.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-pack.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-pack.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,366 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+;
+; UNDEF Elts
+;
+
+define <8 x i16> @undef_packssdw_128() {
+; CHECK-LABEL: @undef_packssdw_128(
+; CHECK-NEXT:    ret <8 x i16> undef
+;
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @undef_packusdw_128() {
+; CHECK-LABEL: @undef_packusdw_128(
+; CHECK-NEXT:    ret <8 x i16> undef
+;
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef)
+  ret <8 x i16> %1
+}
+
+define <16 x i8> @undef_packsswb_128() {
+; CHECK-LABEL: @undef_packsswb_128(
+; CHECK-NEXT:    ret <16 x i8> undef
+;
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @undef_packuswb_128() {
+; CHECK-LABEL: @undef_packuswb_128(
+; CHECK-NEXT:    ret <16 x i8> undef
+;
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef)
+  ret <16 x i8> %1
+}
+
+define <16 x i16> @undef_packssdw_256() {
+; CHECK-LABEL: @undef_packssdw_256(
+; CHECK-NEXT:    ret <16 x i16> undef
+;
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @undef_packusdw_256() {
+; CHECK-LABEL: @undef_packusdw_256(
+; CHECK-NEXT:    ret <16 x i16> undef
+;
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef)
+  ret <16 x i16> %1
+}
+
+define <32 x i8> @undef_packsswb_256() {
+; CHECK-LABEL: @undef_packsswb_256(
+; CHECK-NEXT:    ret <32 x i8> undef
+;
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> undef)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @undef_packuswb_256() {
+; CHECK-LABEL: @undef_packuswb_256(
+; CHECK-NEXT:    ret <32 x i8> undef
+;
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef)
+  ret <32 x i8> %1
+}
+
+define <32 x i16> @undef_packssdw_512() {
+; CHECK-LABEL: @undef_packssdw_512(
+; CHECK-NEXT:    ret <32 x i16> undef
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> undef, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @undef_packusdw_512() {
+; CHECK-LABEL: @undef_packusdw_512(
+; CHECK-NEXT:    ret <32 x i16> undef
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <64 x i8> @undef_packsswb_512() {
+; CHECK-LABEL: @undef_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> undef)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @undef_packuswb_512() {
+; CHECK-LABEL: @undef_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> undef, <32 x i16> undef)
+  ret <64 x i8> %1
+}
+
+;
+; Constant Folding
+;
+
+define <8 x i16> @fold_packssdw_128() {
+; CHECK-LABEL: @fold_packssdw_128(
+; CHECK-NEXT:    ret <8 x i16> <i16 0, i16 -1, i16 32767, i16 -32768, i16 0, i16 0, i16 0, i16 0>
+;
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> <i32 0, i32 -1, i32 65536, i32 -131072>, <4 x i32> zeroinitializer)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @fold_packusdw_128() {
+; CHECK-LABEL: @fold_packusdw_128(
+; CHECK-NEXT:    ret <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0, i16 -32768, i16 -1>
+;
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> <i32 0, i32 -1, i32 32768, i32 65537>)
+  ret <8 x i16> %1
+}
+
+define <16 x i8> @fold_packsswb_128() {
+; CHECK-LABEL: @fold_packsswb_128(
+; CHECK-NEXT:    ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
+;
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> undef)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @fold_packuswb_128() {
+; CHECK-LABEL: @fold_packuswb_128(
+; CHECK-NEXT:    ret <16 x i8> <i8 0, i8 1, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 15, i8 0, i8 127, i8 0, i8 1, i8 0, i8 1, i8 0, i8 0>
+;
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 1, i16 -1, i16 255, i16 65535, i16 -32768, i16 -127, i16 15>, <8 x i16> <i16 -15, i16 127, i16 32768, i16 -65535, i16 -255, i16 1, i16 -1, i16 0>)
+  ret <16 x i8> %1
+}
+
+define <16 x i16> @fold_packssdw_256() {
+; CHECK-LABEL: @fold_packssdw_256(
+; CHECK-NEXT:    ret <16 x i16> <i16 0, i16 256, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef>
+;
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <8 x i32> undef)
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @fold_packusdw_256() {
+; CHECK-LABEL: @fold_packusdw_256(
+; CHECK-NEXT:    ret <16 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 256, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767>
+;
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> <i32 0, i32 -256, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>)
+  ret <16 x i16> %1
+}
+
+define <32 x i8> @fold_packsswb_256() {
+; CHECK-LABEL: @fold_packsswb_256(
+; CHECK-NEXT:    ret <32 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+;
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @fold_packuswb_256() {
+; CHECK-LABEL: @fold_packuswb_256(
+; CHECK-NEXT:    ret <32 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64>
+;
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 256, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>)
+  ret <32 x i8> %1
+}
+
+define <32 x i16> @fold_packssdw_512() {
+; CHECK-LABEL: @fold_packssdw_512(
+; CHECK-NEXT:    ret <32 x i16> <i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef>
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @fold_packusdw_512() {
+; CHECK-LABEL: @fold_packusdw_512(
+; CHECK-NEXT:    ret <32 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767, i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767>
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> <i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767, i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>)
+  ret <32 x i16> %1
+}
+
+define <64 x i8> @fold_packsswb_512() {
+; CHECK-LABEL: @fold_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> zeroinitializer)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @fold_packuswb_512() {
+; CHECK-LABEL: @fold_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64>
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> zeroinitializer, <32 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>)
+  ret <64 x i8> %1
+}
+
+;
+; Demanded Elts
+;
+
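+; These tests exercise instcombine's demanded-elements simplification
+; (roughly, SimplifyDemandedVectorElts): if the shufflevector that follows
+; a pack only reads results derived from one operand, the other operand can
+; be replaced with undef, and input shuffles feeding no demanded result are
+; folded away. For example, in @elts_packssdw_128 below only result
+; elements 1 and 7 are demanded; element 7 traces to an undef input lane,
+; so the whole second operand becomes undef, and the remaining demanded
+; element is %a0's element 1, so the pre-shuffle of %a0 disappears.
+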
+define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @elts_packssdw_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[A0:%.*]], <4 x i32> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+;
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 undef, i32 undef>
+  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>
+  %3 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %2)
+  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i16> %4
+}
+
+define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @elts_packusdw_128(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]])
+; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+;
+  %1 = insertelement <4 x i32> %a0, i32 0, i32 0
+  %2 = insertelement <4 x i32> %a1, i32 0, i32 3
+  %3 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %1, <4 x i32> %2)
+  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
+  ret <8 x i16> %4
+}
+
+define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @elts_packsswb_128(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %1 = insertelement <8 x i16> %a0, i16 0, i32 0
+  %2 = insertelement <8 x i16> %a1, i16 0, i32 0
+  %3 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
+  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i8> %4
+}
+
+define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @elts_packuswb_128(
+; CHECK-NEXT:    ret <16 x i8> undef
+;
+  %1 = insertelement <8 x i16> undef, i16 0, i32 0
+  %2 = insertelement <8 x i16> undef, i16 0, i32 0
+  %3 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
+  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  ret <16 x i8> %4
+}
+
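+; When every demanded element traces to a known input, the call folds all
+; the way: in @elts_packsswb_128 above the demanded bytes all come from
+; inserted zeros (hence zeroinitializer), while in @elts_packuswb_128 they
+; come only from undef input elements (hence undef).
+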
+define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @elts_packssdw_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[A0:%.*]], <8 x i32> undef)
+; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
+;
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef>
+  %3 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %2)
+  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15>
+  ret <16 x i16> %4
+}
+
+define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: @elts_packusdw_256(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+;
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %3 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %2)
+  %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %4
+}
+
+define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: @elts_packsswb_256(
+; CHECK-NEXT:    ret <32 x i8> zeroinitializer
+;
+  %1 = insertelement <16 x i16> %a0, i16 0, i32 0
+  %2 = insertelement <16 x i16> %a1, i16 0, i32 8
+  %3 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
+  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  ret <32 x i8> %4
+}
+
+define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: @elts_packuswb_256(
+; CHECK-NEXT:    ret <32 x i8> undef
+;
+  %1 = insertelement <16 x i16> undef, i16 0, i32 1
+  %2 = insertelement <16 x i16> undef, i16 0, i32 0
+  %3 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
+  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> zeroinitializer
+  ret <32 x i8> %4
+}
+
+define <32 x i16> @elts_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @elts_packssdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A0:%.*]], <16 x i32> undef)
+; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef, i32 undef, i32 10, i32 9, i32 undef, i32 undef, i32 14, i32 13, i32 undef>
+  %3 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2)
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 18, i32 19, i32 20, i32 undef, i32 undef, i32 23, i32 24, i32 undef, i32 undef, i32 27, i32 28, i32 undef, i32 undef, i32 31>
+  ret <32 x i16> %4
+}
+
+define <32 x i16> @elts_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @elts_packusdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+  %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %1, <16 x i32> %2)
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i16> %4
+}
+
+define <64 x i8> @elts_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @elts_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> zeroinitializer
+;
+  %1 = insertelement <32 x i16> %a0, i16 0, i32 0
+  %2 = insertelement <32 x i16> %a1, i16 0, i32 8
+  %3 = insertelement <32 x i16> %1, i16 0, i32 16
+  %4 = insertelement <32 x i16> %2, i16 0, i32 24
+  %5 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %3, <32 x i16> %4)
+  %6 = shufflevector <64 x i8> %5, <64 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
+  ret <64 x i8> %6
+}
+
+define <64 x i8> @elts_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @elts_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = insertelement <32 x i16> undef, i16 0, i32 1
+  %2 = insertelement <32 x i16> undef, i16 0, i32 0
+  %3 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %1, <32 x i16> %2)
+  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> zeroinitializer
+  ret <64 x i8> %4
+}
+
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
+declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
+declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
+declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone

Added: llvm/trunk/test/Transforms/InstCombine/X86/x86-pshufb.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/InstCombine/X86/x86-pshufb.ll?rev=358552&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-pshufb.ll (added)
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-pshufb.ll Tue Apr 16 21:52:47 2019
@@ -0,0 +1,514 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Verify that instcombine is able to fold identity shuffles.
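+; These folds rely on the pshufb rule (sketched here as background, not
+; asserted by the tests themselves): for each result byte, a mask byte with
+; bit 7 set produces zero; otherwise its low four bits index a byte within
+; the same 128-bit lane of the first operand. An in-order 0..15 mask per
+; lane is therefore the identity.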
+
+define <16 x i8> @identity_test(<16 x i8> %InVec) {
+; CHECK-LABEL: @identity_test(
+; CHECK-NEXT:    ret <16 x i8> %InVec
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @identity_test_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx2(
+; CHECK-NEXT:    ret <32 x i8> %InVec
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @identity_test_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx512(
+; CHECK-NEXT:    ret <64 x i8> %InVec
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+; Verify that instcombine is able to fold byte shuffles with zero masks.
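+; Every mask byte in these tests is -128, i.e. has bit 7 set, so each
+; result byte is zero regardless of %InVec.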
+
+define <16 x i8> @fold_to_zero_vector(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_to_zero_vector_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx2(
+; CHECK-NEXT:    ret <32 x i8> zeroinitializer
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @fold_to_zero_vector_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx512(
+; CHECK-NEXT:    ret <64 x i8> zeroinitializer
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <64 x i8> %1
+}
+
+; Instcombine should be able to fold the following byte shuffle to a builtin shufflevector
+; with a shuffle mask of all zeroes.
+
+define <16 x i8> @splat_test(<16 x i8> %InVec) {
+; CHECK-LABEL: @splat_test(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> zeroinitializer)
+  ret <16 x i8> %1
+}
+
+; In the test case below, elements in the low 128-bit lane of the result
+; vector are equal to the lowest byte of %InVec (shuffle index 0).
+; Elements in the high 128-bit lane of the result vector are equal to
+; the lowest byte of the high 128-bit lane of %InVec (shuffle index 16).
+
+define <32 x i8> @splat_test_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @splat_test_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> zeroinitializer)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @splat_test_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @splat_test_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> zeroinitializer)
+  ret <64 x i8> %1
+}
+
+; Each of the byte shuffles in the following tests is equivalent to a blend between
+; vector %InVec and a vector of all zeroes.
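+; Roughly, the fold builds a two-operand shufflevector whose second operand
+; is a constant vector with a zero in element 0 of each lane: in-range mask
+; bytes keep bytes of %InVec, and each -128 mask byte becomes the index of
+; a zero element (16 in the 128-bit case, 32 and 48 for AVX2, and
+; 64/80/96/112 for AVX-512), as the CHECK lines show.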
+
+define <16 x i8> @blend1(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend1(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend2(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 16, i32 16, i32 10, i32 11, i32 16, i32 16, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend3(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 16, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend4(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend4(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend5(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend5(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @blend6(<16 x i8> %InVec) {
+; CHECK-LABEL: @blend6(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @blend1_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend1_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 32, i32 1, i32 32, i32 3, i32 32, i32 5, i32 32, i32 7, i32 32, i32 9, i32 32, i32 11, i32 32, i32 13, i32 32, i32 15, i32 48, i32 17, i32 48, i32 19, i32 48, i32 21, i32 48, i32 23, i32 48, i32 25, i32 48, i32 27, i32 48, i32 29, i32 48, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15, i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend2_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend2_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 32, i32 32, i32 2, i32 3, i32 32, i32 32, i32 6, i32 7, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 14, i32 15, i32 48, i32 48, i32 18, i32 19, i32 48, i32 48, i32 22, i32 23, i32 48, i32 48, i32 26, i32 27, i32 48, i32 48, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15, i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend3_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend3_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 20, i32 21, i32 22, i32 23, i32 48, i32 48, i32 48, i32 48, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend4_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend4_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend5_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend5_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @blend6_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @blend6_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @blend1_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend1_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <64 x i32> <i32 64, i32 1, i32 64, i32 3, i32 64, i32 5, i32 64, i32 7, i32 64, i32 9, i32 64, i32 11, i32 64, i32 13, i32 64, i32 15, i32 80, i32 17, i32 80, i32 19, i32 80, i32 21, i32 80, i32 23, i32 80, i32 25, i32 80, i32 27, i32 80, i32 29, i32 80, i32 31, i32 96, i32 33, i32 96, i32 35, i32 96, i32 37, i32 96, i32 39, i32 96, i32 41, i32 96, i32 43, i32 96, i32 45, i32 96, i32 47, i32 112, i32 49, i32 112, i32 51, i32 112, i32 53, i32 112, i32 55, i32 112, i32 57, i32 112, i32 59, i32 112, i32 61, i32 112, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15, i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15, i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15, i8 -128, i8 1, i8 -128, i8 3, i8 -128, i8 5, i8 -128, i8 7, i8 -128, i8 9, i8 -128, i8 11, i8 -128, i8 13, i8 -128, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend2_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend2_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <64 x i32> <i32 64, i32 64, i32 2, i32 3, i32 64, i32 64, i32 6, i32 7, i32 64, i32 64, i32 10, i32 11, i32 64, i32 64, i32 14, i32 15, i32 80, i32 80, i32 18, i32 19, i32 80, i32 80, i32 22, i32 23, i32 80, i32 80, i32 26, i32 27, i32 80, i32 80, i32 30, i32 31, i32 96, i32 96, i32 34, i32 35, i32 96, i32 96, i32 38, i32 39, i32 96, i32 96, i32 42, i32 43, i32 96, i32 96, i32 46, i32 47, i32 112, i32 112, i32 50, i32 51, i32 112, i32 112, i32 54, i32 55, i32 112, i32 112, i32 58, i32 59, i32 112, i32 112, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15, i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15, i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15, i8 -128, i8 -128, i8 2, i8 3, i8 -128, i8 -128, i8 6, i8 7, i8 -128, i8 -128, i8 10, i8 11, i8 -128, i8 -128, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend3_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend3_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <64 x i32> <i32 64, i32 64, i32 64, i32 64, i32 4, i32 5, i32 6, i32 7, i32 64, i32 64, i32 64, i32 64, i32 12, i32 13, i32 14, i32 15, i32 80, i32 80, i32 80, i32 80, i32 20, i32 21, i32 22, i32 23, i32 80, i32 80, i32 80, i32 80, i32 28, i32 29, i32 30, i32 31, i32 96, i32 96, i32 96, i32 96, i32 36, i32 37, i32 38, i32 39, i32 96, i32 96, i32 96, i32 96, i32 44, i32 45, i32 46, i32 47, i32 112, i32 112, i32 112, i32 112, i32 52, i32 53, i32 54, i32 55, i32 112, i32 112, i32 112, i32 112, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 12, i8 13, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend4_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend4_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <64 x i32> <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend5_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend5_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 16, i32 17, i32 18, i32 19, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 32, i32 33, i32 34, i32 35, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 48, i32 49, i32 50, i32 51, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 2, i8 3, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @blend6_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @blend6_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <64 x i32> <i32 0, i32 1, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 16, i32 17, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 32, i32 33, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 48, i32 49, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 0, i8 1, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <64 x i8> %1
+}
+
+; movq idiom.
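+; Sketch (assuming the pshufb rule above): mask bytes 0..7 pass the low
+; eight source bytes through and the -128 bytes zero the rest, mirroring a
+; movq-style zero-extension of the low 64 bits.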
+define <16 x i8> @movq_idiom(<16 x i8> %InVec) {
+; CHECK-LABEL: @movq_idiom(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @movq_idiom_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @movq_idiom_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @movq_idiom_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @movq_idiom_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 80, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 96, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>)
+  ret <64 x i8> %1
+}
+
+; Vector permutations using byte shuffles.
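+; pshufb never moves bytes across 128-bit lanes, so the repeated per-lane
+; masks below become shufflevector indices offset by 16, 32 and 48 in the
+; wider variants.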
+
+define <16 x i8> @permute1(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute1(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @permute2(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @permute1_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute1_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @permute2_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute2_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @permute1_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @permute1_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31, i32 36, i32 37, i32 38, i32 39, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 44, i32 45, i32 46, i32 47, i32 52, i32 53, i32 54, i32 55, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 12, i8 13, i8 14, i8 15>)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @permute2_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @permute2_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <64 x i8> %1
+}
+
+; Test that instcombine correctly folds a pshufb whose mask values are
+; neither -128 nor encodable in four bits.
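+; Sketch of the wrapping rule these tests assume: a mask byte with bit 7
+; clear selects source byte (value mod 16), so 16 acts like 0, 33 like 1
+; and 66 like 2, while any negative byte (not just -128) has bit 7 set and
+; produces zero.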
+
+define <16 x i8> @identity_test2_2(<16 x i8> %InVec) {
+; CHECK-LABEL: @identity_test2_2(
+; CHECK-NEXT:    ret <16 x i8> %InVec
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @identity_test_avx2_2(<32 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx2_2(
+; CHECK-NEXT:    ret <32 x i8> %InVec
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 16, i8 33, i8 66, i8 19, i8 36, i8 69, i8 22, i8 39, i8 72, i8 25, i8 42, i8 75, i8 28, i8 45, i8 78, i8 31, i8 48, i8 81, i8 34, i8 51, i8 84, i8 37, i8 54, i8 87, i8 40, i8 57, i8 90, i8 43, i8 60, i8 93, i8 46, i8 63>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @identity_test_avx512_2(<64 x i8> %InVec) {
+; CHECK-LABEL: @identity_test_avx512_2(
+; CHECK-NEXT:    ret <64 x i8> %InVec
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 16, i8 33, i8 66, i8 19, i8 36, i8 69, i8 22, i8 39, i8 72, i8 25, i8 42, i8 75, i8 28, i8 45, i8 78, i8 31, i8 48, i8 81, i8 34, i8 51, i8 84, i8 37, i8 54, i8 87, i8 40, i8 57, i8 90, i8 43, i8 60, i8 93, i8 46, i8 63, i8 96, i8 49, i8 66, i8 99, i8 52, i8 69, i8 102, i8 55, i8 72, i8 105, i8 58, i8 75, i8 108, i8 61, i8 78, i8 111, i8 64, i8 81, i8 114, i8 67, i8 84, i8 117, i8 70, i8 87, i8 120, i8 73, i8 90, i8 123, i8 76, i8 93, i8 126, i8 79>)
+  ret <64 x i8> %1
+}
+
+define <16 x i8> @fold_to_zero_vector_2(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_2(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 -125, i8 -1, i8 -53, i8 -32, i8 -4, i8 -7, i8 -33, i8 -66, i8 -99, i8 -120, i8 -100, i8 -22, i8 -17, i8 -1, i8 -11, i8 -15>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_to_zero_vector_avx2_2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx2_2(
+; CHECK-NEXT:    ret <32 x i8> zeroinitializer
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 -127, i8 -1, i8 -53, i8 -32, i8 -4, i8 -7, i8 -33, i8 -66, i8 -99, i8 -120, i8 -100, i8 -22, i8 -17, i8 -1, i8 -11, i8 -15, i8 -126, i8 -2, i8 -52, i8 -31, i8 -5, i8 -8, i8 -34, i8 -67, i8 -100, i8 -119, i8 -101, i8 -23, i8 -16, i8 -2, i8 -12, i8 -16>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @fold_to_zero_vector_avx512_2(<64 x i8> %InVec) {
+; CHECK-LABEL: @fold_to_zero_vector_avx512_2(
+; CHECK-NEXT:    ret <64 x i8> zeroinitializer
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 -127, i8 -1, i8 -53, i8 -32, i8 -4, i8 -7, i8 -33, i8 -66, i8 -99, i8 -120, i8 -100, i8 -22, i8 -17, i8 -1, i8 -11, i8 -15, i8 -126, i8 -2, i8 -52, i8 -31, i8 -5, i8 -8, i8 -34, i8 -67, i8 -100, i8 -119, i8 -101, i8 -23, i8 -16, i8 -2, i8 -12, i8 -16, i8 -125, i8 -3, i8 -51, i8 -30, i8 -6, i8 -9, i8 -35, i8 -68, i8 -101, i8 -118, i8 -102, i8 -24, i8 -15, i8 -3, i8 -13, i8 -17, i8 -124, i8 -4, i8 -56, i8 -29, i8 -7, i8 -10, i8 -36, i8 -69, i8 -102, i8 -117, i8 -103, i8 -25, i8 -14, i8 -4, i8 -14, i8 -18>)
+  ret <64 x i8> %1
+}
+
+define <16 x i8> @permute3(<16 x i8> %InVec) {
+; CHECK-LABEL: @permute3(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 48, i8 17, i8 34, i8 51, i8 20, i8 37, i8 54, i8 23, i8 16, i8 49, i8 66, i8 19, i8 52, i8 69, i8 22, i8 55>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @permute3_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @permute3_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 52, i8 21, i8 38, i8 55, i8 20, i8 37, i8 54, i8 23, i8 28, i8 61, i8 78, i8 31, i8 60, i8 29, i8 30, i8 79, i8 52, i8 21, i8 38, i8 55, i8 20, i8 53, i8 102, i8 23, i8 92, i8 93, i8 94, i8 95, i8 108, i8 109, i8 110, i8 111>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @permute3_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @permute3_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31, i32 36, i32 37, i32 38, i32 39, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 44, i32 45, i32 46, i32 47, i32 52, i32 53, i32 54, i32 55, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 52, i8 21, i8 38, i8 55, i8 20, i8 37, i8 54, i8 23, i8 28, i8 61, i8 78, i8 31, i8 60, i8 29, i8 30, i8 79, i8 52, i8 21, i8 38, i8 55, i8 20, i8 53, i8 102, i8 23, i8 92, i8 93, i8 94, i8 95, i8 108, i8 109, i8 110, i8 111, i8 52, i8 21, i8 38, i8 55, i8 20, i8 37, i8 54, i8 23, i8 28, i8 61, i8 78, i8 31, i8 60, i8 29, i8 30, i8 79, i8 52, i8 21, i8 38, i8 55, i8 20, i8 53, i8 102, i8 23, i8 108, i8 109, i8 110, i8 111, i8 124, i8 125, i8 126, i8 127>)
+  ret <64 x i8> %1
+}
+
+; Verify that instcombine is able to fold constant byte shuffles with undef mask elements.
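+; An undef mask byte leaves the corresponding lane of the resulting shufflevector
+; mask undef; the remaining constant lanes are still folded as usual.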
+
+define <16 x i8> @fold_with_undef_elts(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_undef_elts(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> %InVec, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 0, i32 16, i32 undef, i32 16, i32 1, i32 16, i32 undef, i32 16, i32 2, i32 16, i32 undef, i32 16, i32 3, i32 16, i32 undef, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> <i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128>)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_with_undef_elts_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_undef_elts_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> %InVec, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 0, i32 32, i32 undef, i32 32, i32 1, i32 32, i32 undef, i32 32, i32 2, i32 32, i32 undef, i32 32, i32 3, i32 32, i32 undef, i32 32, i32 16, i32 48, i32 undef, i32 48, i32 17, i32 48, i32 undef, i32 48, i32 18, i32 48, i32 undef, i32 48, i32 19, i32 48, i32 undef, i32 48>
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> <i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128, i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128>)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @fold_with_undef_elts_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_undef_elts_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i8> %InVec, <64 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <64 x i32> <i32 0, i32 64, i32 undef, i32 64, i32 1, i32 64, i32 undef, i32 64, i32 2, i32 64, i32 undef, i32 64, i32 3, i32 64, i32 undef, i32 64, i32 16, i32 80, i32 undef, i32 80, i32 17, i32 80, i32 undef, i32 80, i32 18, i32 80, i32 undef, i32 80, i32 19, i32 80, i32 undef, i32 80, i32 32, i32 96, i32 undef, i32 96, i32 33, i32 96, i32 undef, i32 96, i32 34, i32 96, i32 undef, i32 96, i32 35, i32 96, i32 undef, i32 96, i32 48, i32 112, i32 undef, i32 112, i32 49, i32 112, i32 undef, i32 112, i32 50, i32 112, i32 undef, i32 112, i32 51, i32 112, i32 undef, i32 112>
+; CHECK-NEXT:    ret <64 x i8> [[TMP1]]
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> <i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128, i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128, i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128, i8 0, i8 -128, i8 undef, i8 -128, i8 1, i8 -128, i8 undef, i8 -128, i8 2, i8 -128, i8 undef, i8 -128, i8 3, i8 -128, i8 undef, i8 -128>)
+  ret <64 x i8> %1
+}
+
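+; When the entire mask is undef, every result byte is undef, so the call folds to undef.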
+define <16 x i8> @fold_with_allundef_elts(<16 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_allundef_elts(
+; CHECK-NEXT:    ret <16 x i8> undef
+;
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> undef)
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @fold_with_allundef_elts_avx2(<32 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_allundef_elts_avx2(
+; CHECK-NEXT:    ret <32 x i8> undef
+;
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> undef)
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @fold_with_allundef_elts_avx512(<64 x i8> %InVec) {
+; CHECK-LABEL: @fold_with_allundef_elts_avx512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> undef)
+  ret <64 x i8> %1
+}
+
+; Demanded elts tests.
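+; The trailing shufflevector only reads a subset of the pshufb result, so mask
+; lanes that feed only unread result bytes are not demanded, and the
+; insertelements that define them can be dropped.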
+
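+; Result lanes 0 and 15 are undef in the final shuffle, so the %M0 and %M15
+; inserts into the mask are undemanded and removed.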
+define <16 x i8> @demanded_elts_insertion(<16 x i8> %InVec, <16 x i8> %BaseMask, i8 %M0, i8 %M15) {
+; CHECK-LABEL: @demanded_elts_insertion(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %BaseMask)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>
+; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+;
+  %1 = insertelement <16 x i8> %BaseMask, i8 %M0, i32 0
+  %2 = insertelement <16 x i8> %1, i8 %M15, i32 15
+  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %InVec, <16 x i8> %2)
+  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>
+  ret <16 x i8> %4
+}
+
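+; Result lanes 0 and 22 are undef in the final shuffle, so both inserted mask
+; bytes (%M0, %M22) are undemanded; the identity-with-undef shuffle itself also
+; simplifies away.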
+define <32 x i8> @demanded_elts_insertion_avx2(<32 x i8> %InVec, <32 x i8> %BaseMask, i8 %M0, i8 %M22) {
+; CHECK-LABEL: @demanded_elts_insertion_avx2(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> %BaseMask)
+; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+;
+  %1 = insertelement <32 x i8> %BaseMask, i8 %M0, i32 0
+  %2 = insertelement <32 x i8> %1, i8 %M22, i32 22
+  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %InVec, <32 x i8> %2)
+  %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 undef, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %4
+}
+
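+; The zeroinitializer shuffle mask only reads result byte 0, so only mask lane 0
+; (%M0) is demanded; %BaseMask becomes undef and the %M30 insert is dropped.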
+define <64 x i8> @demanded_elts_insertion_avx512(<64 x i8> %InVec, <64 x i8> %BaseMask, i8 %M0, i8 %M30) {
+; CHECK-LABEL: @demanded_elts_insertion_avx512(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <64 x i8> undef, i8 %M0, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <64 x i8> [[TMP2]], <64 x i8> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+;
+  %1 = insertelement <64 x i8> %BaseMask, i8 %M0, i32 0
+  %2 = insertelement <64 x i8> %1, i8 %M30, i32 30
+  %3 = tail call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %InVec, <64 x i8> %2)
+  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> zeroinitializer
+  ret <64 x i8> %4
+}
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
+declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)



