[llvm] [InstCombine] Fold shuffles through all trivially vectorizable intrinsics (PR #141979)

Thu May 29 11:43:42 PDT 2025

https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/141979

>From 46d23a731a12b19cc1043fd30b2a9155796ba67c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 29 May 2025 17:44:06 +0100
Subject: [PATCH 1/3] Precommit tests

---
 llvm/test/Transforms/InstCombine/abs-1.ll | 11 +++++++++++
 llvm/test/Transforms/InstCombine/fma.ll   | 15 +++++++++++++++
 llvm/test/Transforms/InstCombine/sqrt.ll  | 11 +++++++++++
 3 files changed, 37 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll
index 7037647d116ba..8841b25fb249d 100644
--- a/llvm/test/Transforms/InstCombine/abs-1.ll
+++ b/llvm/test/Transforms/InstCombine/abs-1.ll
@@ -978,3 +978,14 @@ define i32 @abs_diff_signed_slt_no_nsw_swap(i32 %a, i32 %b) {
   %cond = select i1 %cmp, i32 %sub_ba, i32 %sub_ab
   ret i32 %cond
 }
+
+define <2 x i32> @abs_unary_shuffle_ops(<2 x i32> %x) {
+; CHECK-LABEL: @abs_unary_shuffle_ops(
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <2 x i32> [[R:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[R2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[R1]], i1 false)
+; CHECK-NEXT:    ret <2 x i32> [[R2]]
+;
+  %a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x i32> @llvm.abs(<2 x i32> %a, i1 false)
+  ret <2 x i32> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll
index f0d4f776a5d90..800f9beb974d9 100644
--- a/llvm/test/Transforms/InstCombine/fma.ll
+++ b/llvm/test/Transforms/InstCombine/fma.ll
@@ -972,6 +972,21 @@ define <2 x half> @fma_negone_vec_partial_undef(<2 x half> %x, <2 x half> %y) {
   ret <2 x half> %sub
 }
 
+define <2 x float> @fmuladd_unary_shuffle_ops(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
+; CHECK-LABEL: @fmuladd_unary_shuffle_ops(
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[C:%.*]] = shufflevector <2 x float> [[Z:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]])
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %b = shufflevector <2 x float> %y, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %c = shufflevector <2 x float> %z, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+  ret <2 x float> %r
+}
+
 ; negative tests
 
 define half @fma_non_negone(half %x, half %y) {
diff --git a/llvm/test/Transforms/InstCombine/sqrt.ll b/llvm/test/Transforms/InstCombine/sqrt.ll
index 0f4db3b3a65ae..ccc8fc693d58d 100644
--- a/llvm/test/Transforms/InstCombine/sqrt.ll
+++ b/llvm/test/Transforms/InstCombine/sqrt.ll
@@ -201,6 +201,17 @@ define <2 x float> @sqrt_exp_vec(<2 x float> %x) {
   ret <2 x float> %res
 }
 
+define <2 x float> @sqrt_unary_shuffle_ops(<2 x float> %x) {
+; CHECK-LABEL: @sqrt_unary_shuffle_ops(
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A]])
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x float> @llvm.sqrt(<2 x float> %a)
+  ret <2 x float> %r
+}
+
 declare i32 @foo(double)
 declare double @sqrt(double) readnone
 declare float @sqrtf(float)

>From 4c47e2b8737f8570e9fd34b0dc910aa612cf6318 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 29 May 2025 17:45:34 +0100
Subject: [PATCH 2/3] [InstCombine] Fold shuffles through all trivially
 vectorizable intrinsics

This addresses a TODO in foldShuffledIntrinsicOperands to use isTriviallyVectorizable instead of a hardcoded list of intrinsics, which in turn allows more intriniscs to be scalarized by VectorCombine.

>From what I can tell every intrinsic here should be speculatable so an assertion was added.

Because this enables intrinsics like abs which have a scalar operand, we need to also check isVectorIntrinsicWithScalarOpAtArg.
---
 .../InstCombine/InstCombineCalls.cpp          | 41 ++++++++++---------
 llvm/test/Transforms/InstCombine/abs-1.ll     |  6 +--
 llvm/test/Transforms/InstCombine/fma.ll       |  8 ++--
 llvm/test/Transforms/InstCombine/sqrt.ll      |  6 +--
 4 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e101edf4a6208..5eb466f6c6df1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1401,26 +1401,25 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
 /// try to shuffle after the intrinsic.
 Instruction *
 InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
-  // TODO: This should be extended to handle other intrinsics like fshl, ctpop,
-  //       etc. Use llvm::isTriviallyVectorizable() and related to determine
-  //       which intrinsics are safe to shuffle?
-  switch (II->getIntrinsicID()) {
-  case Intrinsic::smax:
-  case Intrinsic::smin:
-  case Intrinsic::umax:
-  case Intrinsic::umin:
-  case Intrinsic::fma:
-  case Intrinsic::fshl:
-  case Intrinsic::fshr:
-    break;
-  default:
+  if (!isTriviallyVectorizable(II->getIntrinsicID()))
+    return nullptr;
+
+  assert(isSafeToSpeculativelyExecute(II) &&
+         "Trivially vectorizable but not safe to speculatively execute?");
+
+  // fabs is canonicalized to fabs (shuffle ...) in foldShuffleOfUnaryOps, so
+  // avoid undoing it.
+  if (match(II, m_FAbs(m_Value())))
     return nullptr;
-  }
 
   Value *X;
   Constant *C;
   ArrayRef<int> Mask;
-  auto *NonConstArg = find_if_not(II->args(), IsaPred<Constant>);
+  auto *NonConstArg = find_if_not(II->args(), [&II](Use &Arg) {
+    return isa<Constant>(Arg.get()) ||
+           isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
+                                              Arg.getOperandNo(), nullptr);
+  });
   if (!NonConstArg ||
       !match(NonConstArg, m_Shuffle(m_Value(X), m_Poison(), m_Mask(Mask))))
     return nullptr;
@@ -1432,11 +1431,15 @@ InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
   // See if all arguments are shuffled with the same mask.
   SmallVector<Value *, 4> NewArgs;
   Type *SrcTy = X->getType();
-  for (Value *Arg : II->args()) {
-    if (match(Arg, m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
-        X->getType() == SrcTy)
+  for (Use &Arg : II->args()) {
+    if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
+                                           Arg.getOperandNo(), nullptr))
+      NewArgs.push_back(Arg);
+    else if (match(&Arg,
+                   m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
+             X->getType() == SrcTy)
       NewArgs.push_back(X);
-    else if (match(Arg, m_ImmConstant(C))) {
+    else if (match(&Arg, m_ImmConstant(C))) {
       // If it's a constant, try find the constant that would be shuffled to C.
       if (Constant *ShuffledC =
               unshuffleConstant(Mask, C, cast<VectorType>(SrcTy)))
diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll
index 8841b25fb249d..fd67fc3421498 100644
--- a/llvm/test/Transforms/InstCombine/abs-1.ll
+++ b/llvm/test/Transforms/InstCombine/abs-1.ll
@@ -981,9 +981,9 @@ define i32 @abs_diff_signed_slt_no_nsw_swap(i32 %a, i32 %b) {
 
 define <2 x i32> @abs_unary_shuffle_ops(<2 x i32> %x) {
 ; CHECK-LABEL: @abs_unary_shuffle_ops(
-; CHECK-NEXT:    [[R1:%.*]] = shufflevector <2 x i32> [[R:%.*]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[R2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[R1]], i1 false)
-; CHECK-NEXT:    ret <2 x i32> [[R2]]
+; CHECK-NEXT:    [[R2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[R1:%.*]], i1 false)
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[R2]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
   %r = call <2 x i32> @llvm.abs(<2 x i32> %a, i1 false)
diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll
index 800f9beb974d9..e3d3e722bcc23 100644
--- a/llvm/test/Transforms/InstCombine/fma.ll
+++ b/llvm/test/Transforms/InstCombine/fma.ll
@@ -974,11 +974,9 @@ define <2 x half> @fma_negone_vec_partial_undef(<2 x half> %x, <2 x half> %y) {
 
 define <2 x float> @fmuladd_unary_shuffle_ops(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
 ; CHECK-LABEL: @fmuladd_unary_shuffle_ops(
-; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[B:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[C:%.*]] = shufflevector <2 x float> [[Z:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]])
-; CHECK-NEXT:    ret <2 x float> [[R]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x float> [[R1]]
 ;
   %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
   %b = shufflevector <2 x float> %y, <2 x float> poison, <2 x i32> <i32 1, i32 0>
diff --git a/llvm/test/Transforms/InstCombine/sqrt.ll b/llvm/test/Transforms/InstCombine/sqrt.ll
index ccc8fc693d58d..2fda5bc37d023 100644
--- a/llvm/test/Transforms/InstCombine/sqrt.ll
+++ b/llvm/test/Transforms/InstCombine/sqrt.ll
@@ -203,9 +203,9 @@ define <2 x float> @sqrt_exp_vec(<2 x float> %x) {
 
 define <2 x float> @sqrt_unary_shuffle_ops(<2 x float> %x) {
 ; CHECK-LABEL: @sqrt_unary_shuffle_ops(
-; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A]])
-; CHECK-NEXT:    ret <2 x float> [[R]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x float> [[R1]]
 ;
   %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
   %r = call <2 x float> @llvm.sqrt(<2 x float> %a)

>From 3ab18649a881566b455928c147469b5fa07834c7 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 29 May 2025 19:43:25 +0100
Subject: [PATCH 3/3] Update SLP tests

---
 .../AMDGPU/add_sub_sat-inseltpoison.ll        | 30 ++++++----------
 .../SLPVectorizer/AMDGPU/add_sub_sat.ll       | 30 ++++++----------
 .../X86/alternate-calls-inseltpoison.ll       | 36 +++++++++----------
 .../SLPVectorizer/X86/alternate-calls.ll      | 36 +++++++++----------
 4 files changed, 56 insertions(+), 76 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 3749bdf1bba39..783a1e83c6724 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 0bb641371825b..7e31ec9a0b39a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index b790e6f3c99c6..77d36f0107665 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -38,13 +38,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -59,13 +59,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP8]])
+; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -80,13 +80,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index ef1a67032c237..18d79752b0b44 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -38,13 +38,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -59,13 +59,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP8]])
+; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -80,13 +80,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>