[clang] [llvm] [AMDGPU] Enable vectorization of i8 values. (PR #134934)

Thu Jun 26 13:46:26 PDT 2025

https://github.com/doru1004 updated https://github.com/llvm/llvm-project/pull/134934

>From f968b80b31955b3f42cf84ee98f6067446234408 Mon Sep 17 00:00:00 2001
From: Erich Keane <ekeane at nvidia.com>
Date: Thu, 26 Jun 2025 13:17:22 -0700
Subject: [PATCH] [OpenACC][Docs] Add a release note for Clang 21 (#145938)

This patch adds a release note that explains the current status of
OpenACC in Clang. Currently we cannot actually make an executable
because the OpenACC dialect of MLIR doesn't support any amount of
lowering to LLVM-IR, so the usefulness of OpenACC is entirely for
front-end related uses, such as tooling or semantic checking.
---
 clang/docs/ReleaseNotes.rst                   |  20 +-
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  55 +-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  14 +
 llvm/test/Analysis/CostModel/AMDGPU/load.ll   |  66 +--
 .../CostModel/AMDGPU/shufflevector.ll         | 400 +++++++-------
 llvm/test/Analysis/CostModel/AMDGPU/store.ll  |  66 +--
 .../SLPVectorizer/AMDGPU/vectorize-i8.ll      | 504 ++++--------------
 7 files changed, 455 insertions(+), 670 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 01f3b7a557a5c..86216f1503457 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1068,10 +1068,22 @@ Static Analyzer
 New features
 ^^^^^^^^^^^^
 
-A new flag - `-static-libclosure` was introduced to support statically linking
-the runtime for the Blocks extension on Windows. This flag currently only
-changes the code generation, and even then, only on Windows. This does not
-impact the linker behaviour like the other `-static-*` flags.
+- A new flag - `-static-libclosure` was introduced to support statically linking
+  the runtime for the Blocks extension on Windows. This flag currently only
+  changes the code generation, and even then, only on Windows. This does not
+  impact the linker behaviour like the other `-static-*` flags.
+- OpenACC support, enabled via `-fopenacc` has reached a level of completeness
+  to finally be at least notionally usable. Currently, the OpenACC 3.4
+  specification has been completely implemented for Sema and AST creation, so
+  nodes will show up in the AST after having been properly checked. Lowering is
+  currently a work in progress, with compute, loop, and combined constructs
+  partially implemented, plus a handful of data and executable constructs
+  implemented. Lowering will only work in Clang-IR mode (so only with a compiler
+  built with Clang-IR enabled, and with `-fclangir` used on the command line).
+  However, note that the Clang-IR implementation status is also quite partial,
+  so frequent 'not yet implemented' diagnostics should be expected.  Also, the
+  ACC MLIR dialect does not currently implement any lowering to LLVM-IR, so no
+  code generation is possible for OpenACC.
 
 Crash and bug fixes
 ^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5e41273556d3d..f693580929518 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  // For a given width return the max 0number of elements that can be combined
+  // into a wider bit value:
+  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1195,14 +1198,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1211,9 +1215,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1225,12 +1229,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1505,3 +1509,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
   return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
                                                : KnownIEEEMode::On;
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType()->isIntegerTy(8)) {
+      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
+                        getLoadStoreVecRegBitWidth(AddressSpace));
+    }
+  }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+    if (VecTy->getElementType()->isIntegerTy(8)) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return divideCeil(ElementCount - 1, 4);
+    }
+  }
+  return BaseT::getNumberOfParts(Tp);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 64a244e33f18f..20da8344c9d37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -288,6 +288,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
   /// "amdgpu-ieee"="false".
   KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
+
+  /// Account for loads of i8 vector types to have reduced cost. For
+  /// example the cost of load 4 i8s values is one is the cost of loading
+  /// a single i32 value.
+  InstructionCost getMemoryOpCost(
+      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind,
+      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+      const Instruction *I = nullptr) const override;
+
+  /// When counting parts on AMD GPUs, account for i8s being grouped
+  /// together under a single i32 value. Otherwise fall back to base
+  /// implementation.
+  unsigned getNumberOfParts(Type *Tp) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/load.ll b/llvm/test/Analysis/CostModel/AMDGPU/load.ll
index 3f8016178e719..6ec84bd88cd4d 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/load.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/load.ll
@@ -21,17 +21,17 @@ define void @loads_i1(i32 %arg) {
 define void @loads_i8(i32 %arg) {
 ; GFX90A-LABEL: 'loads_i8'
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = load <2 x i8>, ptr poison, align 2
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = load <3 x i8>, ptr poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %4 = load <4 x i8>, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i8>, ptr poison, align 2
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i8>, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i8>, ptr poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = load i8, ptr poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %7 = load <3 x i8>, ptr poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %8 = load <4 x i8>, ptr poison, align 1
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr poison, align 1
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <3 x i8>, ptr poison, align 1
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <4 x i8>, ptr poison, align 1
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = load i8, ptr poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %10 = load <2 x i8>, ptr poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %11 = load <3 x i8>, ptr poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %12 = load <4 x i8>, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i8>, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = load <3 x i8>, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <4 x i8>, ptr poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
   load i8, ptr poison
@@ -154,35 +154,35 @@ define void @loads_addrspace_1(i32 %arg) {
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(1) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(1) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(1) poison, align 2
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(1) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(1) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(1) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = load <3 x i1>, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(1) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(1) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr addrspace(1) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(1) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(1) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(1) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(1) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(1) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(1) poison, align 8
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr addrspace(1) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(1) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(1) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(1) poison, align 2
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(1) poison, align 16
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <16 x i8>, ptr addrspace(1) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(1) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(1) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(1) poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(1) poison, align 32
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = load <32 x i8>, ptr addrspace(1) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(1) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(1) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(1) poison, align 8
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(1) poison, align 64
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %30 = load <64 x i8>, ptr addrspace(1) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(1) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(1) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(1) poison, align 16
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(1) poison, align 128
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %34 = load <128 x i8>, ptr addrspace(1) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(1) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %36 = load <128 x i32>, ptr addrspace(1) poison, align 512
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
@@ -241,35 +241,35 @@ define void @loads_addrspace_3(i32 %arg) {
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(3) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(3) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(3) poison, align 2
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(3) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(3) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(3) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = load <3 x i1>, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(3) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(3) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr addrspace(3) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(3) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(3) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(3) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(3) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(3) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(3) poison, align 8
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr addrspace(3) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(3) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(3) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(3) poison, align 2
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(3) poison, align 16
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = load <16 x i8>, ptr addrspace(3) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(3) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(3) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(3) poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(3) poison, align 32
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %26 = load <32 x i8>, ptr addrspace(3) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(3) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(3) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(3) poison, align 8
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(3) poison, align 64
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %30 = load <64 x i8>, ptr addrspace(3) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(3) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(3) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(3) poison, align 16
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(3) poison, align 128
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %34 = load <128 x i8>, ptr addrspace(3) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(3) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %36 = load <128 x i32>, ptr addrspace(3) poison, align 512
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
@@ -328,35 +328,35 @@ define void @loads_addrspace_5(i32 %arg) {
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = load <1 x i16>, ptr addrspace(5) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = load <1 x i32>, ptr addrspace(5) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = load <2 x i1>, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %6 = load <2 x i8>, ptr addrspace(5) poison, align 2
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = load <2 x i8>, ptr addrspace(5) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = load <2 x i16>, ptr addrspace(5) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i32>, ptr addrspace(5) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = load <3 x i1>, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = load <3 x i8>, ptr addrspace(5) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = load <3 x i8>, ptr addrspace(5) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %11 = load <3 x i16>, ptr addrspace(5) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = load <3 x i32>, ptr addrspace(5) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %13 = load <4 x i1>, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %14 = load <4 x i8>, ptr addrspace(5) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr addrspace(5) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr addrspace(5) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr addrspace(5) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %17 = load <8 x i1>, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %18 = load <8 x i8>, ptr addrspace(5) poison, align 8
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %18 = load <8 x i8>, ptr addrspace(5) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = load <8 x i16>, ptr addrspace(5) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i32>, ptr addrspace(5) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %21 = load <16 x i1>, ptr addrspace(5) poison, align 2
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %22 = load <16 x i8>, ptr addrspace(5) poison, align 16
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %22 = load <16 x i8>, ptr addrspace(5) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = load <16 x i16>, ptr addrspace(5) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr addrspace(5) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %25 = load <32 x i1>, ptr addrspace(5) poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %26 = load <32 x i8>, ptr addrspace(5) poison, align 32
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %26 = load <32 x i8>, ptr addrspace(5) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = load <32 x i16>, ptr addrspace(5) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = load <32 x i32>, ptr addrspace(5) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %29 = load <64 x i1>, ptr addrspace(5) poison, align 8
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %30 = load <64 x i8>, ptr addrspace(5) poison, align 64
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %30 = load <64 x i8>, ptr addrspace(5) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %31 = load <64 x i16>, ptr addrspace(5) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %32 = load <64 x i32>, ptr addrspace(5) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %33 = load <128 x i1>, ptr addrspace(5) poison, align 16
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %34 = load <128 x i8>, ptr addrspace(5) poison, align 128
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %34 = load <128 x i8>, ptr addrspace(5) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %35 = load <128 x i16>, ptr addrspace(5) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %36 = load <128 x i32>, ptr addrspace(5) poison, align 512
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index 4fc8c44e12668..67a1597d77a6f 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -396,157 +396,157 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) {
 ; Should not assert
 define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-LABEL: 'shufflevector_i8'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'shufflevector_i8'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
@@ -861,22 +861,22 @@ define amdgpu_kernel void @shufflevector_i32(<2 x i32> %vec1, <2 x i32> %vec2) {
 ; Other shuffle cases
 define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i8> %i8v16, <16 x i8> %i8v16_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) {
 ; GFX9-10-LABEL: 'shuffle'
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
 ; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
 ; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -898,22 +898,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8>
 ; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; VI-LABEL: 'shuffle'
-; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
 ; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
 ; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -935,22 +935,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8>
 ; VI-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-10-SIZE-LABEL: 'shuffle'
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
 ; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
 ; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -972,22 +972,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8>
 ; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; VI-SIZE-LABEL: 'shuffle'
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
 ; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
 ; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -1047,9 +1047,9 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8>
 
 define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x half> %halfv8, <8 x half> %halfv8_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) {
 ; ALL-LABEL: 'concat'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1062,9 +1062,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1080,9 +1080,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'concat'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1095,9 +1095,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/store.ll b/llvm/test/Analysis/CostModel/AMDGPU/store.ll
index 9672c3256c751..6dc4befdfbd9e 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/store.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/store.ll
@@ -20,17 +20,17 @@ define void @stores_i1(i32 %arg) {
 define void @stores_i8(i32 %arg) {
 ; GFX90A-LABEL: 'stores_i8'
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 2
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 2
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 1
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 1
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 1
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 1
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 poison, ptr poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
   store i8 poison, ptr poison
@@ -153,35 +153,35 @@ define void @stores_addrspace_1(i32 %arg) {
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(1) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> poison, ptr addrspace(1) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(1) poison, align 2
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(1) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(1) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(1) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(1) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(1) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(1) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(1) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(1) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(1) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(1) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr addrspace(1) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(1) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(1) poison, align 8
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> poison, ptr addrspace(1) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(1) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(1) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(1) poison, align 2
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(1) poison, align 16
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> poison, ptr addrspace(1) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(1) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(1) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(1) poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(1) poison, align 32
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i8> poison, ptr addrspace(1) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(1) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(1) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(1) poison, align 8
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(1) poison, align 64
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <64 x i8> poison, ptr addrspace(1) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(1) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(1) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(1) poison, align 16
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(1) poison, align 128
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <128 x i8> poison, ptr addrspace(1) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(1) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(1) poison, align 512
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
@@ -241,35 +241,35 @@ define void @stores_addrspace_3(i32 %arg) {
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(3) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> poison, ptr addrspace(3) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(3) poison, align 2
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(3) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(3) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(3) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(3) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(3) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(3) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(3) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(3) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(3) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(3) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr addrspace(3) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(3) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(3) poison, align 8
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> poison, ptr addrspace(3) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(3) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(3) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(3) poison, align 2
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(3) poison, align 16
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> poison, ptr addrspace(3) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(3) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(3) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(3) poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(3) poison, align 32
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <32 x i8> poison, ptr addrspace(3) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(3) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(3) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(3) poison, align 8
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(3) poison, align 64
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <64 x i8> poison, ptr addrspace(3) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(3) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(3) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(3) poison, align 16
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(3) poison, align 128
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <128 x i8> poison, ptr addrspace(3) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(3) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(3) poison, align 512
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
@@ -329,35 +329,35 @@ define void @stores_addrspace_5(i32 %arg) {
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i16> poison, ptr addrspace(5) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> poison, ptr addrspace(5) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i1> poison, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <2 x i8> poison, ptr addrspace(5) poison, align 2
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> poison, ptr addrspace(5) poison, align 2
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> poison, ptr addrspace(5) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> poison, ptr addrspace(5) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i1> poison, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <3 x i8> poison, ptr addrspace(5) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i8> poison, ptr addrspace(5) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i16> poison, ptr addrspace(5) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> poison, ptr addrspace(5) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i1> poison, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <4 x i8> poison, ptr addrspace(5) poison, align 4
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> poison, ptr addrspace(5) poison, align 4
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i16> poison, ptr addrspace(5) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> poison, ptr addrspace(5) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i1> poison, ptr addrspace(5) poison, align 1
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <8 x i8> poison, ptr addrspace(5) poison, align 8
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <8 x i8> poison, ptr addrspace(5) poison, align 8
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> poison, ptr addrspace(5) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <8 x i32> poison, ptr addrspace(5) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i1> poison, ptr addrspace(5) poison, align 2
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <16 x i8> poison, ptr addrspace(5) poison, align 16
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <16 x i8> poison, ptr addrspace(5) poison, align 16
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i16> poison, ptr addrspace(5) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <16 x i32> poison, ptr addrspace(5) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i1> poison, ptr addrspace(5) poison, align 4
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <32 x i8> poison, ptr addrspace(5) poison, align 32
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <32 x i8> poison, ptr addrspace(5) poison, align 32
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i16> poison, ptr addrspace(5) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> poison, ptr addrspace(5) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i1> poison, ptr addrspace(5) poison, align 8
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: store <64 x i8> poison, ptr addrspace(5) poison, align 64
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <64 x i8> poison, ptr addrspace(5) poison, align 64
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <64 x i16> poison, ptr addrspace(5) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <64 x i32> poison, ptr addrspace(5) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: store <128 x i1> poison, ptr addrspace(5) poison, align 16
-; GFX90A-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: store <128 x i8> poison, ptr addrspace(5) poison, align 128
+; GFX90A-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: store <128 x i8> poison, ptr addrspace(5) poison, align 128
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <128 x i16> poison, ptr addrspace(5) poison, align 256
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <128 x i32> poison, ptr addrspace(5) poison, align 512
 ; GFX90A-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll
index b9b1bc1be681e..605eccf26cd3c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll
@@ -21,28 +21,20 @@ define protected amdgpu_kernel void @arith_2(<16 x i8> %invec, ptr %out, i32 %fl
 ; GFX8-LABEL: define protected amdgpu_kernel void @arith_2(
 ; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
 ; GFX8-NEXT:  [[ENTRY:.*:]]
-; GFX8-NEXT:    [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
-; GFX8-NEXT:    [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
-; GFX8-NEXT:    [[MUL0:%.*]] = mul i8 [[EL0]], 1
-; GFX8-NEXT:    [[MUL1:%.*]] = mul i8 [[EL1]], 1
-; GFX8-NEXT:    [[ADD0:%.*]] = add i8 [[MUL0]], 1
-; GFX8-NEXT:    [[ADD1:%.*]] = add i8 [[MUL1]], 1
-; GFX8-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
-; GFX8-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1)
+; GFX8-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1)
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX8-NEXT:    store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
 ; GFX8-NEXT:    ret void
 ;
 ; GFX9-LABEL: define protected amdgpu_kernel void @arith_2(
 ; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
 ; GFX9-NEXT:  [[ENTRY:.*:]]
-; GFX9-NEXT:    [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
-; GFX9-NEXT:    [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
-; GFX9-NEXT:    [[MUL0:%.*]] = mul i8 [[EL0]], 1
-; GFX9-NEXT:    [[MUL1:%.*]] = mul i8 [[EL1]], 1
-; GFX9-NEXT:    [[ADD0:%.*]] = add i8 [[MUL0]], 1
-; GFX9-NEXT:    [[ADD1:%.*]] = add i8 [[MUL1]], 1
-; GFX9-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
-; GFX9-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1)
+; GFX9-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1)
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX9-NEXT:    store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
 ; GFX9-NEXT:    ret void
 ;
@@ -82,17 +74,14 @@ define protected amdgpu_kernel void @arith_3(<16 x i8> %invec, ptr %out, i32 %fl
 ; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX8-NEXT:  [[ENTRY:.*:]]
 ; GFX8-NEXT:    [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
-; GFX8-NEXT:    [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
-; GFX8-NEXT:    [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
-; GFX8-NEXT:    [[MUL2:%.*]] = mul i8 [[EL0]], 1
-; GFX8-NEXT:    [[MUL1:%.*]] = mul i8 [[EL1]], 1
-; GFX8-NEXT:    [[MUL3:%.*]] = mul i8 [[EL2]], 1
-; GFX8-NEXT:    [[ADD2:%.*]] = add i8 [[MUL2]], 1
-; GFX8-NEXT:    [[ADD1:%.*]] = add i8 [[MUL1]], 1
+; GFX8-NEXT:    [[MUL3:%.*]] = mul i8 [[EL0]], 1
 ; GFX8-NEXT:    [[ADD3:%.*]] = add i8 [[MUL3]], 1
-; GFX8-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD2]], i64 0
-; GFX8-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
-; GFX8-NEXT:    [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD3]], i64 2
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> <i32 1, i32 2>
+; GFX8-NEXT:    [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1)
+; GFX8-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1)
+; GFX8-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD3]], i64 0
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VECINS2:%.*]] = shufflevector <16 x i8> [[VECINS0]], <16 x i8> [[TMP3]], <16 x i32> <i32 0, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX8-NEXT:    store <16 x i8> [[VECINS2]], ptr [[OUT]], align 16
 ; GFX8-NEXT:    ret void
 ;
@@ -100,17 +89,14 @@ define protected amdgpu_kernel void @arith_3(<16 x i8> %invec, ptr %out, i32 %fl
 ; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX9-NEXT:  [[ENTRY:.*:]]
 ; GFX9-NEXT:    [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
-; GFX9-NEXT:    [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
-; GFX9-NEXT:    [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
-; GFX9-NEXT:    [[MUL2:%.*]] = mul i8 [[EL0]], 1
-; GFX9-NEXT:    [[MUL1:%.*]] = mul i8 [[EL1]], 1
-; GFX9-NEXT:    [[MUL3:%.*]] = mul i8 [[EL2]], 1
-; GFX9-NEXT:    [[ADD2:%.*]] = add i8 [[MUL2]], 1
-; GFX9-NEXT:    [[ADD1:%.*]] = add i8 [[MUL1]], 1
+; GFX9-NEXT:    [[MUL3:%.*]] = mul i8 [[EL0]], 1
 ; GFX9-NEXT:    [[ADD3:%.*]] = add i8 [[MUL3]], 1
-; GFX9-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD2]], i64 0
-; GFX9-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
-; GFX9-NEXT:    [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD3]], i64 2
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> <i32 1, i32 2>
+; GFX9-NEXT:    [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1)
+; GFX9-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1)
+; GFX9-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD3]], i64 0
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VECINS2:%.*]] = shufflevector <16 x i8> [[VECINS0]], <16 x i8> [[TMP3]], <16 x i32> <i32 0, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX9-NEXT:    store <16 x i8> [[VECINS2]], ptr [[OUT]], align 16
 ; GFX9-NEXT:    ret void
 ;
@@ -157,44 +143,20 @@ define protected amdgpu_kernel void @arith_4(<16 x i8> %invec, ptr %out, i32 %fl
 ; GFX8-LABEL: define protected amdgpu_kernel void @arith_4(
 ; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX8-NEXT:  [[ENTRY:.*:]]
-; GFX8-NEXT:    [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
-; GFX8-NEXT:    [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
-; GFX8-NEXT:    [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
-; GFX8-NEXT:    [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3
-; GFX8-NEXT:    [[MUL0:%.*]] = mul i8 [[EL0]], 1
-; GFX8-NEXT:    [[MUL1:%.*]] = mul i8 [[EL1]], 1
-; GFX8-NEXT:    [[MUL2:%.*]] = mul i8 [[EL2]], 1
-; GFX8-NEXT:    [[MUL3:%.*]] = mul i8 [[EL3]], 1
-; GFX8-NEXT:    [[ADD0:%.*]] = add i8 [[MUL0]], 1
-; GFX8-NEXT:    [[ADD1:%.*]] = add i8 [[MUL1]], 1
-; GFX8-NEXT:    [[ADD2:%.*]] = add i8 [[MUL2]], 1
-; GFX8-NEXT:    [[ADD3:%.*]] = add i8 [[MUL3]], 1
-; GFX8-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
-; GFX8-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
-; GFX8-NEXT:    [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2
-; GFX8-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
+; GFX8-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX8-NEXT:    store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
 ; GFX8-NEXT:    ret void
 ;
 ; GFX9-LABEL: define protected amdgpu_kernel void @arith_4(
 ; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX9-NEXT:  [[ENTRY:.*:]]
-; GFX9-NEXT:    [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
-; GFX9-NEXT:    [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
-; GFX9-NEXT:    [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
-; GFX9-NEXT:    [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3
-; GFX9-NEXT:    [[MUL0:%.*]] = mul i8 [[EL0]], 1
-; GFX9-NEXT:    [[MUL1:%.*]] = mul i8 [[EL1]], 1
-; GFX9-NEXT:    [[MUL2:%.*]] = mul i8 [[EL2]], 1
-; GFX9-NEXT:    [[MUL3:%.*]] = mul i8 [[EL3]], 1
-; GFX9-NEXT:    [[ADD0:%.*]] = add i8 [[MUL0]], 1
-; GFX9-NEXT:    [[ADD1:%.*]] = add i8 [[MUL1]], 1
-; GFX9-NEXT:    [[ADD2:%.*]] = add i8 [[MUL2]], 1
-; GFX9-NEXT:    [[ADD3:%.*]] = add i8 [[MUL3]], 1
-; GFX9-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
-; GFX9-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
-; GFX9-NEXT:    [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2
-; GFX9-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
+; GFX9-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX9-NEXT:    store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
 ; GFX9-NEXT:    ret void
 ;
@@ -293,140 +255,50 @@ define protected amdgpu_kernel void @arith_16(<16 x i8> %invec, ptr %out, i32 %f
 ; GFX8-LABEL: define protected amdgpu_kernel void @arith_16(
 ; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX8-NEXT:  [[ENTRY:.*:]]
-; GFX8-NEXT:    [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
-; GFX8-NEXT:    [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
-; GFX8-NEXT:    [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
-; GFX8-NEXT:    [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3
-; GFX8-NEXT:    [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4
-; GFX8-NEXT:    [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5
-; GFX8-NEXT:    [[EL6:%.*]] = extractelement <16 x i8> [[INVEC]], i64 6
-; GFX8-NEXT:    [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7
-; GFX8-NEXT:    [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8
-; GFX8-NEXT:    [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9
-; GFX8-NEXT:    [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10
-; GFX8-NEXT:    [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11
-; GFX8-NEXT:    [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12
-; GFX8-NEXT:    [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13
-; GFX8-NEXT:    [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14
-; GFX8-NEXT:    [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15
-; GFX8-NEXT:    [[MUL0:%.*]] = mul i8 [[EL0]], 1
-; GFX8-NEXT:    [[MUL1:%.*]] = mul i8 [[EL1]], 1
-; GFX8-NEXT:    [[MUL2:%.*]] = mul i8 [[EL2]], 1
-; GFX8-NEXT:    [[MUL3:%.*]] = mul i8 [[EL3]], 1
-; GFX8-NEXT:    [[MUL4:%.*]] = mul i8 [[EL4]], 1
-; GFX8-NEXT:    [[MUL5:%.*]] = mul i8 [[EL5]], 1
-; GFX8-NEXT:    [[MUL6:%.*]] = mul i8 [[EL6]], 1
-; GFX8-NEXT:    [[MUL7:%.*]] = mul i8 [[EL7]], 1
-; GFX8-NEXT:    [[MUL8:%.*]] = mul i8 [[EL8]], 1
-; GFX8-NEXT:    [[MUL9:%.*]] = mul i8 [[EL9]], 1
-; GFX8-NEXT:    [[MUL10:%.*]] = mul i8 [[EL10]], 1
-; GFX8-NEXT:    [[MUL11:%.*]] = mul i8 [[EL11]], 1
-; GFX8-NEXT:    [[MUL12:%.*]] = mul i8 [[EL12]], 1
-; GFX8-NEXT:    [[MUL13:%.*]] = mul i8 [[EL13]], 1
-; GFX8-NEXT:    [[MUL14:%.*]] = mul i8 [[EL14]], 1
-; GFX8-NEXT:    [[MUL15:%.*]] = mul i8 [[EL15]], 1
-; GFX8-NEXT:    [[ADD0:%.*]] = add i8 [[MUL0]], 1
-; GFX8-NEXT:    [[ADD1:%.*]] = add i8 [[MUL1]], 1
-; GFX8-NEXT:    [[ADD2:%.*]] = add i8 [[MUL2]], 1
-; GFX8-NEXT:    [[ADD3:%.*]] = add i8 [[MUL3]], 1
-; GFX8-NEXT:    [[ADD4:%.*]] = add i8 [[MUL4]], 1
-; GFX8-NEXT:    [[ADD5:%.*]] = add i8 [[MUL5]], 1
-; GFX8-NEXT:    [[ADD6:%.*]] = add i8 [[MUL6]], 1
-; GFX8-NEXT:    [[ADD7:%.*]] = add i8 [[MUL7]], 1
-; GFX8-NEXT:    [[ADD8:%.*]] = add i8 [[MUL8]], 1
-; GFX8-NEXT:    [[ADD9:%.*]] = add i8 [[MUL9]], 1
-; GFX8-NEXT:    [[ADD10:%.*]] = add i8 [[MUL10]], 1
-; GFX8-NEXT:    [[ADD11:%.*]] = add i8 [[MUL11]], 1
-; GFX8-NEXT:    [[ADD12:%.*]] = add i8 [[MUL12]], 1
-; GFX8-NEXT:    [[ADD13:%.*]] = add i8 [[MUL13]], 1
-; GFX8-NEXT:    [[ADD14:%.*]] = add i8 [[MUL14]], 1
-; GFX8-NEXT:    [[ADD15:%.*]] = add i8 [[MUL15]], 1
-; GFX8-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
-; GFX8-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
-; GFX8-NEXT:    [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2
-; GFX8-NEXT:    [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3
-; GFX8-NEXT:    [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4
-; GFX8-NEXT:    [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5
-; GFX8-NEXT:    [[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6
-; GFX8-NEXT:    [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7
-; GFX8-NEXT:    [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8
-; GFX8-NEXT:    [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9
-; GFX8-NEXT:    [[VECINS10:%.*]] = insertelement <16 x i8> [[VECINS9]], i8 [[ADD10]], i64 10
-; GFX8-NEXT:    [[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11
-; GFX8-NEXT:    [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12
-; GFX8-NEXT:    [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13
-; GFX8-NEXT:    [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14
-; GFX8-NEXT:    [[VECINS153:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
+; GFX8-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GFX8-NEXT:    [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1)
+; GFX8-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1)
+; GFX8-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GFX8-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1)
+; GFX8-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1)
+; GFX8-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GFX8-NEXT:    [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1)
+; GFX8-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1)
+; GFX8-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GFX8-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; GFX8-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; GFX8-NEXT:    store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
 ; GFX8-NEXT:    ret void
 ;
 ; GFX9-LABEL: define protected amdgpu_kernel void @arith_16(
 ; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX9-NEXT:  [[ENTRY:.*:]]
-; GFX9-NEXT:    [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
-; GFX9-NEXT:    [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
-; GFX9-NEXT:    [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
-; GFX9-NEXT:    [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3
-; GFX9-NEXT:    [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4
-; GFX9-NEXT:    [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5
-; GFX9-NEXT:    [[EL6:%.*]] = extractelement <16 x i8> [[INVEC]], i64 6
-; GFX9-NEXT:    [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7
-; GFX9-NEXT:    [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8
-; GFX9-NEXT:    [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9
-; GFX9-NEXT:    [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10
-; GFX9-NEXT:    [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11
-; GFX9-NEXT:    [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12
-; GFX9-NEXT:    [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13
-; GFX9-NEXT:    [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14
-; GFX9-NEXT:    [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15
-; GFX9-NEXT:    [[MUL0:%.*]] = mul i8 [[EL0]], 1
-; GFX9-NEXT:    [[MUL1:%.*]] = mul i8 [[EL1]], 1
-; GFX9-NEXT:    [[MUL2:%.*]] = mul i8 [[EL2]], 1
-; GFX9-NEXT:    [[MUL3:%.*]] = mul i8 [[EL3]], 1
-; GFX9-NEXT:    [[MUL4:%.*]] = mul i8 [[EL4]], 1
-; GFX9-NEXT:    [[MUL5:%.*]] = mul i8 [[EL5]], 1
-; GFX9-NEXT:    [[MUL6:%.*]] = mul i8 [[EL6]], 1
-; GFX9-NEXT:    [[MUL7:%.*]] = mul i8 [[EL7]], 1
-; GFX9-NEXT:    [[MUL8:%.*]] = mul i8 [[EL8]], 1
-; GFX9-NEXT:    [[MUL9:%.*]] = mul i8 [[EL9]], 1
-; GFX9-NEXT:    [[MUL10:%.*]] = mul i8 [[EL10]], 1
-; GFX9-NEXT:    [[MUL11:%.*]] = mul i8 [[EL11]], 1
-; GFX9-NEXT:    [[MUL12:%.*]] = mul i8 [[EL12]], 1
-; GFX9-NEXT:    [[MUL13:%.*]] = mul i8 [[EL13]], 1
-; GFX9-NEXT:    [[MUL14:%.*]] = mul i8 [[EL14]], 1
-; GFX9-NEXT:    [[MUL15:%.*]] = mul i8 [[EL15]], 1
-; GFX9-NEXT:    [[ADD0:%.*]] = add i8 [[MUL0]], 1
-; GFX9-NEXT:    [[ADD1:%.*]] = add i8 [[MUL1]], 1
-; GFX9-NEXT:    [[ADD2:%.*]] = add i8 [[MUL2]], 1
-; GFX9-NEXT:    [[ADD3:%.*]] = add i8 [[MUL3]], 1
-; GFX9-NEXT:    [[ADD4:%.*]] = add i8 [[MUL4]], 1
-; GFX9-NEXT:    [[ADD5:%.*]] = add i8 [[MUL5]], 1
-; GFX9-NEXT:    [[ADD6:%.*]] = add i8 [[MUL6]], 1
-; GFX9-NEXT:    [[ADD7:%.*]] = add i8 [[MUL7]], 1
-; GFX9-NEXT:    [[ADD8:%.*]] = add i8 [[MUL8]], 1
-; GFX9-NEXT:    [[ADD9:%.*]] = add i8 [[MUL9]], 1
-; GFX9-NEXT:    [[ADD10:%.*]] = add i8 [[MUL10]], 1
-; GFX9-NEXT:    [[ADD11:%.*]] = add i8 [[MUL11]], 1
-; GFX9-NEXT:    [[ADD12:%.*]] = add i8 [[MUL12]], 1
-; GFX9-NEXT:    [[ADD13:%.*]] = add i8 [[MUL13]], 1
-; GFX9-NEXT:    [[ADD14:%.*]] = add i8 [[MUL14]], 1
-; GFX9-NEXT:    [[ADD15:%.*]] = add i8 [[MUL15]], 1
-; GFX9-NEXT:    [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
-; GFX9-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
-; GFX9-NEXT:    [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2
-; GFX9-NEXT:    [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3
-; GFX9-NEXT:    [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4
-; GFX9-NEXT:    [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5
-; GFX9-NEXT:    [[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6
-; GFX9-NEXT:    [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7
-; GFX9-NEXT:    [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8
-; GFX9-NEXT:    [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9
-; GFX9-NEXT:    [[VECINS10:%.*]] = insertelement <16 x i8> [[VECINS9]], i8 [[ADD10]], i64 10
-; GFX9-NEXT:    [[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11
-; GFX9-NEXT:    [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12
-; GFX9-NEXT:    [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13
-; GFX9-NEXT:    [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14
-; GFX9-NEXT:    [[VECINS153:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
+; GFX9-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GFX9-NEXT:    [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1)
+; GFX9-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1)
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GFX9-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1)
+; GFX9-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1)
+; GFX9-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GFX9-NEXT:    [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1)
+; GFX9-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1)
+; GFX9-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GFX9-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; GFX9-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; GFX9-NEXT:    store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
 ; GFX9-NEXT:    ret void
 ;
@@ -529,19 +401,13 @@ define protected amdgpu_kernel void @phi_2(ptr addrspace(3) %inptr0, ptr addrspa
 ; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX8-NEXT:  [[ENTRY:.*]]:
 ; GFX8-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX8-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX8:       [[DO_BODY]]:
-; GFX8-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX8-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX8-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8-NEXT:    [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX8-NEXT:    [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT:    [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX8-NEXT:    store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX8-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
 ; GFX8-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -554,19 +420,13 @@ define protected amdgpu_kernel void @phi_2(ptr addrspace(3) %inptr0, ptr addrspa
 ; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX9-NEXT:  [[ENTRY:.*]]:
 ; GFX9-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX9-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX9-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX9-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX9-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX9:       [[DO_BODY]]:
-; GFX9-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX9-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX9-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX9-NEXT:    [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX9-NEXT:    [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX9-NEXT:    store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX9-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
 ; GFX9-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -637,24 +497,18 @@ define protected amdgpu_kernel void @phi_3(ptr addrspace(3) %inptr0, ptr addrspa
 ; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX8-NEXT:  [[ENTRY:.*]]:
 ; GFX8-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX8-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
 ; GFX8-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
 ; GFX8-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX8:       [[DO_BODY]]:
 ; GFX8-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8-NEXT:    [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT:    [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX8-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX8-NEXT:    [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10
-; GFX8-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8-NEXT:    [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX8-NEXT:    [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX8-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10
 ; GFX8-NEXT:    store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX8-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
@@ -668,24 +522,18 @@ define protected amdgpu_kernel void @phi_3(ptr addrspace(3) %inptr0, ptr addrspa
 ; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX9-NEXT:  [[ENTRY:.*]]:
 ; GFX9-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX9-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX9-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX9-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX9-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
 ; GFX9-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
 ; GFX9-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX9:       [[DO_BODY]]:
 ; GFX9-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX9-NEXT:    [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX9-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX9-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX9-NEXT:    [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10
-; GFX9-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX9-NEXT:    [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX9-NEXT:    [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX9-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10
 ; GFX9-NEXT:    store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX9-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
@@ -769,31 +617,13 @@ define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspa
 ; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX8-NEXT:  [[ENTRY:.*]]:
 ; GFX8-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX8-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX8-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX8-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX8:       [[DO_BODY]]:
-; GFX8-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX8-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX8-NEXT:    [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX8-NEXT:    [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
-; GFX8-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
-; GFX8-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX8-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX8-NEXT:    [[VEC131:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX8-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VEC131:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX8-NEXT:    store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX8-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
 ; GFX8-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -806,31 +636,13 @@ define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspa
 ; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX9-NEXT:  [[ENTRY:.*]]:
 ; GFX9-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX9-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX9-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX9-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX9-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX9-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX9-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX9:       [[DO_BODY]]:
-; GFX9-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX9-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX9-NEXT:    [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX9-NEXT:    [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
-; GFX9-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
-; GFX9-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX9-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX9-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX9-NEXT:    [[VEC131:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX9-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VEC131:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX9-NEXT:    store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX9-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
 ; GFX9-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -882,31 +694,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0,
 ; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX7-NEXT:  [[ENTRY:.*]]:
 ; GFX7-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX7-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX7-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX7-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX7-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX7-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX7:       [[DO_BODY]]:
-; GFX7-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX7-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX7-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX7-NEXT:    store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2
-; GFX7-NEXT:    store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2
-; GFX7-NEXT:    store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2
-; GFX7-NEXT:    store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2
-; GFX7-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX7-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX7-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX7-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX7-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX7-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT:    store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
+; GFX7-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX7-NEXT:    store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX7-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
 ; GFX7-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -919,31 +713,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0,
 ; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX8-NEXT:  [[ENTRY:.*]]:
 ; GFX8-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX8-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX8-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX8-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX8:       [[DO_BODY]]:
-; GFX8-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX8-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX8-NEXT:    store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2
-; GFX8-NEXT:    store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2
-; GFX8-NEXT:    store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2
-; GFX8-NEXT:    store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2
-; GFX8-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX8-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX8-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX8-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8-NEXT:    store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX8-NEXT:    store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX8-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
 ; GFX8-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -956,31 +732,13 @@ define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0,
 ; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX9-NEXT:  [[ENTRY:.*]]:
 ; GFX9-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
-; GFX9-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX9-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX9-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX9-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX9-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX9-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX9:       [[DO_BODY]]:
-; GFX9-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
-; GFX9-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX9-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX9-NEXT:    store i8 [[PHI3]], ptr addrspace(3) [[GEP0]], align 2
-; GFX9-NEXT:    store i8 [[PHI2]], ptr addrspace(3) [[GEP1]], align 2
-; GFX9-NEXT:    store i8 [[PHI1]], ptr addrspace(3) [[GEP2]], align 2
-; GFX9-NEXT:    store i8 [[PHI0]], ptr addrspace(3) [[GEP3]], align 2
-; GFX9-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX9-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX9-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX9-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX9-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX9-NEXT:    store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GFX9-NEXT:    store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
 ; GFX9-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
 ; GFX9-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
@@ -1031,10 +789,6 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in
 ; GFX7-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
 ; GFX7-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX7-NEXT:  [[ENTRY:.*]]:
-; GFX7-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3
-; GFX7-NEXT:    [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2
-; GFX7-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1
-; GFX7-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0
 ; GFX7-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX7:       [[DO_BODY]]:
 ; GFX7-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
@@ -1044,22 +798,12 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in
 ; GFX7-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
 ; GFX7:       [[EXIT]]:
 ; GFX7-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
-; GFX7-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1
-; GFX7-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2
-; GFX7-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3
-; GFX7-NEXT:    store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1
-; GFX7-NEXT:    store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1
-; GFX7-NEXT:    store i8 [[TMP1]], ptr addrspace(3) [[GEP2]], align 1
-; GFX7-NEXT:    store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT:    store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
 ; GFX7-NEXT:    ret void
 ;
 ; GFX8-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
 ; GFX8-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX8-NEXT:  [[ENTRY:.*]]:
-; GFX8-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3
-; GFX8-NEXT:    [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2
-; GFX8-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1
-; GFX8-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0
 ; GFX8-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX8:       [[DO_BODY]]:
 ; GFX8-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
@@ -1069,22 +813,12 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in
 ; GFX8-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
 ; GFX8:       [[EXIT]]:
 ; GFX8-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
-; GFX8-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1
-; GFX8-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2
-; GFX8-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3
-; GFX8-NEXT:    store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1
-; GFX8-NEXT:    store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1
-; GFX8-NEXT:    store i8 [[TMP1]], ptr addrspace(3) [[GEP2]], align 1
-; GFX8-NEXT:    store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1
+; GFX8-NEXT:    store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
 ; GFX8-NEXT:    ret void
 ;
 ; GFX9-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
 ; GFX9-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
 ; GFX9-NEXT:  [[ENTRY:.*]]:
-; GFX9-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 3
-; GFX9-NEXT:    [[TMP1:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 2
-; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 1
-; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[INPTR0]], i32 0
 ; GFX9-NEXT:    br label %[[DO_BODY:.*]]
 ; GFX9:       [[DO_BODY]]:
 ; GFX9-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
@@ -1094,13 +828,7 @@ define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %in
 ; GFX9-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
 ; GFX9:       [[EXIT]]:
 ; GFX9-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
-; GFX9-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 1
-; GFX9-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 2
-; GFX9-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 3
-; GFX9-NEXT:    store i8 [[TMP3]], ptr addrspace(3) [[GEP0]], align 1
-; GFX9-NEXT:    store i8 [[TMP2]], ptr addrspace(3) [[GEP1]], align 1
-; GFX9-NEXT:    store i8 [[TMP1]], ptr addrspace(3) [[GEP2]], align 1
-; GFX9-NEXT:    store i8 [[TMP4]], ptr addrspace(3) [[GEP3]], align 1
+; GFX9-NEXT:    store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
 ; GFX9-NEXT:    ret void
 ;
 entry: