[llvm] [AMDGPU] Vectorize i8 Shuffles (PR #95840)

Mon Jun 17 13:31:09 PDT 2024

https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/95840

i8 shuffle vectors have standard isel dags and will be lowered into v_perms with CalculateByteProvider. 

Moreever, if we are unconvinced by https://github.com/llvm/llvm-project/pull/95328 , enabling i8 shuffle vectorization should indirectly capture many of the needed vectorization cases for https://github.com/llvm/llvm-project/pull/66838  . In that sense, this PR sort of competes with https://github.com/llvm/llvm-project/pull/95328 , though it does have standalone merits.

>From 69ecffc456160b7ba2aedab16dfff8ecc33eeb9a Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 3 May 2024 11:37:40 -0700
Subject: [PATCH 1/3] [AMDGPU] Allow SLP to analyze i8s

Change-Id: Ia995bc646e5f050083bd6277eeabe0b5ab410f47
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  20 +-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |   1 +
 .../AMDGPU/add_sub_sat-inseltpoison.ll        |  55 +++
 .../SLPVectorizer/AMDGPU/add_sub_sat.ll       |  56 +++
 .../Transforms/SLPVectorizer/AMDGPU/i8.ll     | 412 ++++++++++++++++++
 .../AMDGPU/phi-result-use-order.ll            |  75 ++++
 .../SLPVectorizer/AMDGPU/reduction.ll         | 327 ++++++++++++++
 7 files changed, 943 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b6..2430ad2a9a3aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -306,6 +306,18 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
+  if (auto VTy = dyn_cast<FixedVectorType>(Tp)) {
+    if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
+      auto ElCount = VTy->getElementCount().getFixedValue();
+      return ElCount / 4;
+    }
+  }
+
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+  return LT.first.isValid() ? *LT.first.getValue() : 0;
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
@@ -337,9 +349,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+
+  return (ElemWidth == 8)                              ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df17302ca..53502cbe490c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -117,6 +117,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
     return TTI::PSK_FastHardware;
   }
 
+  unsigned getNumberOfParts(Type *Tp) const;
   unsigned getNumberOfRegisters(unsigned RCID) const;
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 3749bdf1bba39..373c18d0c6431 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -363,11 +363,66 @@ bb:
   ret <4 x i16> %ins.3
 }
 
+define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
+; GCN-LABEL: @uadd_sat_v4i8(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[TMP0:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
+; GCN-NEXT:    ret <4 x i8> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <4 x i8> %arg0, i64 0
+  %arg0.1 = extractelement <4 x i8> %arg0, i64 1
+  %arg0.2 = extractelement <4 x i8> %arg0, i64 2
+  %arg0.3 = extractelement <4 x i8> %arg0, i64 3
+  %arg1.0 = extractelement <4 x i8> %arg1, i64 0
+  %arg1.1 = extractelement <4 x i8> %arg1, i64 1
+  %arg1.2 = extractelement <4 x i8> %arg1, i64 2
+  %arg1.3 = extractelement <4 x i8> %arg1, i64 3
+  %add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0)
+  %add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1)
+  %add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2)
+  %add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3)
+  %ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0
+  %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
+  %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
+  %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
+  ret <4 x i8> %ins.3
+}
+
+define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
+; GCN-LABEL: @usub_sat_v4i8(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[TMP0:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
+; GCN-NEXT:    ret <4 x i8> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <4 x i8> %arg0, i64 0
+  %arg0.1 = extractelement <4 x i8> %arg0, i64 1
+  %arg0.2 = extractelement <4 x i8> %arg0, i64 2
+  %arg0.3 = extractelement <4 x i8> %arg0, i64 3
+  %arg1.0 = extractelement <4 x i8> %arg1, i64 0
+  %arg1.1 = extractelement <4 x i8> %arg1, i64 1
+  %arg1.2 = extractelement <4 x i8> %arg1, i64 2
+  %arg1.3 = extractelement <4 x i8> %arg1, i64 3
+  %add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0)
+  %add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1)
+  %add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2)
+  %add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3)
+  %ins.0 = insertelement <4 x i8> poison, i8 %add.0, i64 0
+  %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
+  %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
+  %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
+  ret <4 x i8> %ins.3
+}
+
 declare i16 @llvm.uadd.sat.i16(i16, i16) #0
 declare i16 @llvm.usub.sat.i16(i16, i16) #0
 declare i16 @llvm.sadd.sat.i16(i16, i16) #0
 declare i16 @llvm.ssub.sat.i16(i16, i16) #0
 
+declare i8 @llvm.uadd.sat.i8(i8, i8) #0
+declare i8 @llvm.usub.sat.i8(i8, i8) #0
+
 declare i32 @llvm.uadd.sat.i32(i32, i32) #0
 declare i32 @llvm.usub.sat.i32(i32, i32) #0
 declare i32 @llvm.sadd.sat.i32(i32, i32) #0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 0bb641371825b..1d18162a7960e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -363,11 +363,67 @@ bb:
   ret <4 x i16> %ins.3
 }
 
+define <4 x i8> @uadd_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1, ptr addrspace(1) %dst) {
+; GCN-LABEL: @uadd_sat_v4i8(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[TMP0:%.*]] = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
+; GCN-NEXT:    ret <4 x i8> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <4 x i8> %arg0, i64 0
+  %arg0.1 = extractelement <4 x i8> %arg0, i64 1
+  %arg0.2 = extractelement <4 x i8> %arg0, i64 2
+  %arg0.3 = extractelement <4 x i8> %arg0, i64 3
+  %arg1.0 = extractelement <4 x i8> %arg1, i64 0
+  %arg1.1 = extractelement <4 x i8> %arg1, i64 1
+  %arg1.2 = extractelement <4 x i8> %arg1, i64 2
+  %arg1.3 = extractelement <4 x i8> %arg1, i64 3
+  %add.0 = call i8 @llvm.uadd.sat.i8(i8 %arg0.0, i8 %arg1.0)
+  %add.1 = call i8 @llvm.uadd.sat.i8(i8 %arg0.1, i8 %arg1.1)
+  %add.2 = call i8 @llvm.uadd.sat.i8(i8 %arg0.2, i8 %arg1.2)
+  %add.3 = call i8 @llvm.uadd.sat.i8(i8 %arg0.3, i8 %arg1.3)
+  %ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0
+  %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
+  %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
+  %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
+  ret <4 x i8> %ins.3
+}
+define <4 x i8> @usub_sat_v4i8(<4 x i8> %arg0, <4 x i8> %arg1) {
+; GCN-LABEL: @usub_sat_v4i8(
+; GCN-NEXT:  bb:
+; GCN-NEXT:    [[TMP0:%.*]] = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> [[ARG0:%.*]], <4 x i8> [[ARG1:%.*]])
+; GCN-NEXT:    ret <4 x i8> [[TMP0]]
+;
+bb:
+  %arg0.0 = extractelement <4 x i8> %arg0, i64 0
+  %arg0.1 = extractelement <4 x i8> %arg0, i64 1
+  %arg0.2 = extractelement <4 x i8> %arg0, i64 2
+  %arg0.3 = extractelement <4 x i8> %arg0, i64 3
+  %arg1.0 = extractelement <4 x i8> %arg1, i64 0
+  %arg1.1 = extractelement <4 x i8> %arg1, i64 1
+  %arg1.2 = extractelement <4 x i8> %arg1, i64 2
+  %arg1.3 = extractelement <4 x i8> %arg1, i64 3
+  %add.0 = call i8 @llvm.usub.sat.i8(i8 %arg0.0, i8 %arg1.0)
+  %add.1 = call i8 @llvm.usub.sat.i8(i8 %arg0.1, i8 %arg1.1)
+  %add.2 = call i8 @llvm.usub.sat.i8(i8 %arg0.2, i8 %arg1.2)
+  %add.3 = call i8 @llvm.usub.sat.i8(i8 %arg0.3, i8 %arg1.3)
+  %ins.0 = insertelement <4 x i8> undef, i8 %add.0, i64 0
+  %ins.1 = insertelement <4 x i8> %ins.0, i8 %add.1, i64 1
+  %ins.2 = insertelement <4 x i8> %ins.1, i8 %add.2, i64 2
+  %ins.3 = insertelement <4 x i8> %ins.2, i8 %add.3, i64 3
+  ret <4 x i8> %ins.3
+
+}
+
+
 declare i16 @llvm.uadd.sat.i16(i16, i16) #0
 declare i16 @llvm.usub.sat.i16(i16, i16) #0
 declare i16 @llvm.sadd.sat.i16(i16, i16) #0
 declare i16 @llvm.ssub.sat.i16(i16, i16) #0
 
+declare i8 @llvm.uadd.sat.i8(i8, i8) #0
+declare i8 @llvm.usub.sat.i8(i8, i8) #0
+
 declare i32 @llvm.uadd.sat.i32(i32, i32) #0
 declare i32 @llvm.usub.sat.i32(i32, i32) #0
 declare i32 @llvm.sadd.sat.i32(i32, i32) #0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll
new file mode 100644
index 0000000000000..9584e30e4ccaf
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GFX8PLUS,GFX9 %s
+
+define protected amdgpu_kernel void @phi(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) {
+; GCN-LABEL: @vectorizePHI(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0
+; GCN-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GCN-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
+; GCN-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GCN-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
+; GCN-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
+; GCN-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GCN-NEXT:    br label [[DO_BODY:%.*]]
+; GCN:       do.body:
+; GCN-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ]
+; GCN-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ]
+; GCN-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ]
+; GCN-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ]
+; GCN-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GCN-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GCN-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GCN-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
+; GCN-NEXT:    [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
+; GCN-NEXT:    [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
+; GCN-NEXT:    [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
+; GCN-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
+; GCN-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GCN-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
+; GCN-NEXT:    [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GCN-NEXT:    store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2
+; GCN-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0
+; GCN-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]]
+; GCN:       exit:
+; GCN-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 16
+; GCN-NEXT:    store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16
+; GCN-NEXT:    ret void
+;
+; GFX7-LABEL: @phi(
+; GFX7-NEXT:  entry:
+; GFX7-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0
+; GFX7-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
+; GFX7-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX7-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
+; GFX7-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GFX7-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
+; GFX7-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT:    br label [[DO_BODY:%.*]]
+; GFX7:       do.body:
+; GFX7-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ]
+; GFX7-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ]
+; GFX7-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ]
+; GFX7-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ]
+; GFX7-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX7-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GFX7-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
+; GFX7-NEXT:    [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
+; GFX7-NEXT:    [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
+; GFX7-NEXT:    [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
+; GFX7-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
+; GFX7-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX7-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
+; GFX7-NEXT:    [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX7-NEXT:    store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2
+; GFX7-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0
+; GFX7-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]]
+; GFX7:       exit:
+; GFX7-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 16
+; GFX7-NEXT:    store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16
+; GFX7-NEXT:    ret void
+;
+; GFX8PLUS-LABEL: @phi(
+; GFX8PLUS-NEXT:  entry:
+; GFX8PLUS-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0
+; GFX8PLUS-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GFX8PLUS-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
+; GFX8PLUS-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8PLUS-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
+; GFX8PLUS-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GFX8PLUS-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
+; GFX8PLUS-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8PLUS-NEXT:    br label [[DO_BODY:%.*]]
+; GFX8PLUS:       do.body:
+; GFX8PLUS-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ]
+; GFX8PLUS-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ]
+; GFX8PLUS-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ]
+; GFX8PLUS-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ]
+; GFX8PLUS-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GFX8PLUS-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8PLUS-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GFX8PLUS-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8PLUS-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
+; GFX8PLUS-NEXT:    [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
+; GFX8PLUS-NEXT:    [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
+; GFX8PLUS-NEXT:    [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
+; GFX8PLUS-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
+; GFX8PLUS-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX8PLUS-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
+; GFX8PLUS-NEXT:    [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX8PLUS-NEXT:    store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2
+; GFX8PLUS-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0
+; GFX8PLUS-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]]
+; GFX8PLUS:       exit:
+; GFX8PLUS-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 16
+; GFX8PLUS-NEXT:    store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16
+; GFX8PLUS-NEXT:    ret void
+;
+entry:
+  %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0
+  %ele0 = load i8, ptr addrspace(3) %gep0, align 8
+  %gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1
+  %ele1 = load i8, ptr addrspace(3) %gep1, align 1
+  %gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2
+  %ele2 = load i8, ptr addrspace(3) %gep2, align 2
+  %gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3
+  %ele3 = load i8, ptr addrspace(3) %gep3, align 1
+  br label %do.body
+
+do.body:
+  %phi0 = phi i8 [ %ele3, %entry ], [ %otherele3, %do.body ]
+  %phi1 = phi i8 [ %ele2, %entry ], [ %otherele2, %do.body ]
+  %phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ]
+  %phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ]
+  %otherele0 = load i8, ptr addrspace(3) %gep0, align 8
+  %otherele1 = load i8, ptr addrspace(3) %gep1, align 1
+  %otherele2 = load i8, ptr addrspace(3) %gep2, align 2
+  %otherele3 = load i8, ptr addrspace(3) %gep3, align 1
+  %vec00 = insertelement <16 x i8> poison, i8 %otherele0, i64 8
+  %vec01 = insertelement <16 x i8> %vec00, i8 %otherele1, i64 9
+  %vec02 = insertelement <16 x i8> %vec01, i8 %otherele2, i64 10
+  %vec03 = insertelement <16 x i8> %vec02, i8 %otherele3, i64 11
+  %vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8
+  %vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9
+  %vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10
+  %vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11
+  store <16 x i8> %vec13, ptr addrspace(3) %inptr1, align 2
+  %cmp = icmp eq i32 %flag, 0
+  br i1 %cmp, label %exit, label %do.body
+
+exit:
+  store <16 x i8> %vec13, ptr %out
+  store <16 x i8> %vec03, ptr %out1
+  ret void
+}
+
+
+define protected amdgpu_kernel void @arith_phi(ptr addrspace(3) %inptr0, ptr %out, i32 %flag) {
+; GCN-LABEL: @vectorizePHI2(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0
+; GCN-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GCN-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
+; GCN-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GCN-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
+; GCN-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
+; GCN-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GCN-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0
+; GCN-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]]
+; GCN:       bb.1:
+; GCN-NEXT:    [[ADD0:%.*]] = add i8 [[ELE0]], 1
+; GCN-NEXT:    [[ADD1:%.*]] = add i8 [[ELE1]], 1
+; GCN-NEXT:    [[ADD2:%.*]] = add i8 [[ELE2]], 1
+; GCN-NEXT:    [[ADD3:%.*]] = add i8 [[ELE3]], 1
+; GCN-NEXT:    br label [[EXIT]]
+; GCN:       exit:
+; GCN-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ]
+; GCN-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ]
+; GCN-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ]
+; GCN-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ]
+; GCN-NEXT:    [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GCN-NEXT:    [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GCN-NEXT:    [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GCN-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
+; GCN-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GCN-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
+; GCN-NEXT:    [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GCN-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2
+; GCN-NEXT:    ret void
+;
+; GFX7-LABEL: @arith_phi(
+; GFX7-NEXT:  entry:
+; GFX7-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0
+; GFX7-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
+; GFX7-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX7-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
+; GFX7-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GFX7-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
+; GFX7-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0
+; GFX7-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]]
+; GFX7:       bb.1:
+; GFX7-NEXT:    [[ADD0:%.*]] = add i8 [[ELE0]], 1
+; GFX7-NEXT:    [[ADD1:%.*]] = add i8 [[ELE1]], 1
+; GFX7-NEXT:    [[ADD2:%.*]] = add i8 [[ELE2]], 1
+; GFX7-NEXT:    [[ADD3:%.*]] = add i8 [[ELE3]], 1
+; GFX7-NEXT:    br label [[EXIT]]
+; GFX7:       exit:
+; GFX7-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ]
+; GFX7-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ]
+; GFX7-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ]
+; GFX7-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ]
+; GFX7-NEXT:    [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT:    [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX7-NEXT:    [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GFX7-NEXT:    [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX7-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
+; GFX7-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX7-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
+; GFX7-NEXT:    [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX7-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2
+; GFX7-NEXT:    ret void
+;
+; GFX8PLUS-LABEL: @arith_phi(
+; GFX8PLUS-NEXT:  entry:
+; GFX8PLUS-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0
+; GFX8PLUS-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GFX8PLUS-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
+; GFX8PLUS-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8PLUS-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
+; GFX8PLUS-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GFX8PLUS-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
+; GFX8PLUS-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8PLUS-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0
+; GFX8PLUS-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]]
+; GFX8PLUS:       bb.1:
+; GFX8PLUS-NEXT:    [[ADD0:%.*]] = add i8 [[ELE0]], 1
+; GFX8PLUS-NEXT:    [[ADD1:%.*]] = add i8 [[ELE1]], 1
+; GFX8PLUS-NEXT:    [[ADD2:%.*]] = add i8 [[ELE2]], 1
+; GFX8PLUS-NEXT:    [[ADD3:%.*]] = add i8 [[ELE3]], 1
+; GFX8PLUS-NEXT:    br label [[EXIT]]
+; GFX8PLUS:       exit:
+; GFX8PLUS-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ]
+; GFX8PLUS-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ]
+; GFX8PLUS-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ]
+; GFX8PLUS-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ]
+; GFX8PLUS-NEXT:    [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
+; GFX8PLUS-NEXT:    [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
+; GFX8PLUS-NEXT:    [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
+; GFX8PLUS-NEXT:    [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8PLUS-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
+; GFX8PLUS-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
+; GFX8PLUS-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
+; GFX8PLUS-NEXT:    [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
+; GFX8PLUS-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2
+; GFX8PLUS-NEXT:    ret void
+;
+entry:
+  %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0
+  %ele0 = load i8, ptr addrspace(3) %gep0, align 8
+  %gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1
+  %ele1 = load i8, ptr addrspace(3) %gep1, align 1
+  %gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2
+  %ele2 = load i8, ptr addrspace(3) %gep2, align 2
+  %gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3
+  %ele3 = load i8, ptr addrspace(3) %gep3, align 1
+  %cmp = icmp eq i32 %flag, 0
+  br i1 %cmp, label %exit, label %bb.1
+
+bb.1:
+  %add0 = add i8 %ele0, 1
+  %add1 = add i8 %ele1, 1
+  %add2 = add i8 %ele2, 1
+  %add3 = add i8 %ele3, 1
+  br label %exit
+
+exit:
+  %phi0 = phi i8 [ %ele3, %entry ], [ %add0, %bb.1 ]
+  %phi1 = phi i8 [ %ele2, %entry ], [ %add1, %bb.1 ]
+  %phi2 = phi i8 [ %ele1, %entry ], [ %add2, %bb.1 ]
+  %phi3 = phi i8 [ %ele0, %entry ], [ %add3, %bb.1 ]
+  %otherele0 = load i8, ptr addrspace(3) %gep0, align 8
+  %otherele1 = load i8, ptr addrspace(3) %gep1, align 1
+  %otherele2 = load i8, ptr addrspace(3) %gep2, align 2
+  %otherele3 = load i8, ptr addrspace(3) %gep3, align 1
+  %vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8
+  %vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9
+  %vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10
+  %vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11
+  store <16 x i8> %vec13, ptr %out, align 2
+  ret void
+}
+
+define protected amdgpu_kernel void @arith(<16 x i8> %invec, ptr %out, i32 %flag) {
+; GFX7-LABEL: @arith(
+; GFX7-NEXT:  entry:
+; GFX7-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX7-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], <i8 1, i8 1, i8 1, i8 1>
+; GFX7-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], <i8 1, i8 1, i8 1, i8 1>
+; GFX7-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GFX7-NEXT:    [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], <i8 1, i8 1, i8 1, i8 1>
+; GFX7-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[TMP4]], <i8 1, i8 1, i8 1, i8 1>
+; GFX7-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GFX7-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], <i8 1, i8 1, i8 1, i8 1>
+; GFX7-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[TMP7]], <i8 1, i8 1, i8 1, i8 1>
+; GFX7-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GFX7-NEXT:    [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], <i8 1, i8 1, i8 1, i8 1>
+; GFX7-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP10]], <i8 1, i8 1, i8 1, i8 1>
+; GFX7-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GFX7-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; GFX7-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; GFX7-NEXT:    store <16 x i8> [[VECINS153]], ptr [[OUT:%.*]], align 16
+; GFX7-NEXT:    ret void
+;
+; GFX8PLUS-LABEL: @arith(
+; GFX8PLUS-NEXT:  entry:
+; GFX8PLUS-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC:%.*]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8PLUS-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GFX8PLUS-NEXT:    [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[TMP4]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GFX8PLUS-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[TMP7]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GFX8PLUS-NEXT:    [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP10]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GFX8PLUS-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; GFX8PLUS-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; GFX8PLUS-NEXT:    store <16 x i8> [[VECINS153]], ptr [[OUT:%.*]], align 16
+; GFX8PLUS-NEXT:    ret void
+;
+entry:
+  %el0 = extractelement <16 x i8> %invec, i64  0
+  %el1 = extractelement <16 x i8> %invec, i64  1
+  %el2 = extractelement <16 x i8> %invec, i64  2
+  %el3 = extractelement <16 x i8> %invec, i64  3
+  %el4 = extractelement <16 x i8> %invec, i64  4
+  %el5 = extractelement <16 x i8> %invec, i64  5
+  %el6 = extractelement <16 x i8> %invec, i64  6
+  %el7 = extractelement <16 x i8> %invec, i64  7
+  %el8 = extractelement <16 x i8> %invec, i64  8
+  %el9 = extractelement <16 x i8> %invec, i64  9
+  %el10 = extractelement <16 x i8> %invec, i64 10
+  %el11 = extractelement <16 x i8> %invec, i64 11
+  %el12 = extractelement <16 x i8> %invec, i64 12
+  %el13 = extractelement <16 x i8> %invec, i64 13
+  %el14 = extractelement <16 x i8> %invec, i64 14
+  %el15 = extractelement <16 x i8> %invec, i64 15
+  %mul0 = mul i8 %el0, 1
+  %mul1 = mul i8 %el1, 1
+  %mul2 = mul i8 %el2, 1
+  %mul3 = mul i8 %el3, 1
+  %mul4 = mul i8 %el4, 1
+  %mul5 = mul i8 %el5, 1
+  %mul6 = mul i8 %el6, 1
+  %mul7 = mul i8 %el7, 1
+  %mul8 = mul i8 %el8, 1
+  %mul9 = mul i8 %el9, 1
+  %mul10 = mul i8 %el10, 1
+  %mul11 = mul i8 %el11, 1
+  %mul12 = mul i8 %el12, 1
+  %mul13 = mul i8 %el13, 1
+  %mul14 = mul i8 %el14, 1
+  %mul15 = mul i8 %el15, 1
+  %add0 = add i8 %mul0, 1
+  %add1 = add i8 %mul1, 1
+  %add2 = add i8 %mul2, 1
+  %add3 = add i8 %mul3, 1
+  %add4 = add i8 %mul4, 1
+  %add5 = add i8 %mul5, 1
+  %add6 = add i8 %mul6, 1
+  %add7 = add i8 %mul7, 1
+  %add8 = add i8 %mul8, 1
+  %add9 = add i8 %mul9, 1
+  %add10 = add i8 %mul10, 1
+  %add11 = add i8 %mul11, 1
+  %add12 = add i8 %mul12, 1
+  %add13 = add i8 %mul13, 1
+  %add14 = add i8 %mul14, 1
+  %add15 = add i8 %mul15, 1
+  %vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0
+  %vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1
+  %vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2
+  %vecins3 = insertelement <16 x i8> %vecins2, i8 %add3, i64 3
+  %vecins4 = insertelement <16 x i8> %vecins3, i8 %add4, i64 4
+  %vecins5 = insertelement <16 x i8> %vecins4, i8 %add5, i64 5
+  %vecins6 = insertelement <16 x i8> %vecins5, i8 %add6, i64 6
+  %vecins7 = insertelement <16 x i8> %vecins6, i8 %add7, i64 7
+  %vecins8 = insertelement <16 x i8> %vecins7, i8 %add8, i64 8
+  %vecins9 = insertelement <16 x i8> %vecins8, i8 %add9, i64 9
+  %vecins10 = insertelement <16 x i8> %vecins9, i8 %add10, i64 10
+  %vecins11 = insertelement <16 x i8> %vecins10, i8 %add11, i64 11
+  %vecins12 = insertelement <16 x i8> %vecins11, i8 %add12, i64 12
+  %vecins13 = insertelement <16 x i8> %vecins12, i8 %add13, i64 13
+  %vecins14 = insertelement <16 x i8> %vecins13, i8 %add14, i64 14
+  %vecins15 = insertelement <16 x i8> %vecins14, i8 %add15, i64 15
+  store <16 x i8> %vecins15, ptr %out
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX8: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 3b63c1e35610f..08c32691cf366 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -90,3 +90,78 @@ bb1:
   %o3 = insertelement <4 x half> %o2, half %c3, i64 3
   ret <4 x half> %o3
 }
+
+
+define <4 x i8> @phisi8(i1 %cmp1, <4 x i8> %in1, <4 x i8> %in2)  {
+; CHECK-LABEL: @phisi8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[IN1:%.*]], [[ENTRY:%.*]] ], [ [[IN2:%.*]], [[BB0]] ]
+; CHECK-NEXT:    ret <4 x i8> [[TMP0]]
+;
+entry:
+  %a0 = extractelement <4 x i8> %in1, i64 0
+  %a1 = extractelement <4 x i8> %in1, i64 1
+  %a2 = extractelement <4 x i8> %in1, i64 2
+  %a3 = extractelement <4 x i8> %in1, i64 3
+  br i1 %cmp1, label %bb1, label %bb0
+
+bb0:
+  %b0 = extractelement <4 x i8> %in2, i64 0
+  %b1 = extractelement <4 x i8> %in2, i64 1
+  %b2 = extractelement <4 x i8> %in2, i64 2
+  %b3 = extractelement <4 x i8> %in2, i64 3
+  br label %bb1
+
+bb1:
+  %c0 = phi i8 [ %a0, %entry ], [ %b0, %bb0 ]
+  %c1 = phi i8 [ %a1, %entry ], [ %b1, %bb0 ]
+  %c2 = phi i8 [ %a2, %entry ], [ %b2, %bb0 ]
+  %c3 = phi i8 [ %a3, %entry ], [ %b3, %bb0 ]
+
+  %o0 = insertelement <4 x i8> undef, i8 %c0, i64 0
+  %o1 = insertelement <4 x i8> %o0, i8 %c1, i64 1
+  %o2 = insertelement <4 x i8> %o1, i8 %c2, i64 2
+  %o3 = insertelement <4 x i8> %o2, i8 %c3, i64 3
+  ret <4 x i8> %o3
+}
+
+define <4 x i8> @phisi8_reverse(i1 %cmp1, <4 x i8> %in1, <4 x i8> %in2)  {
+; CHECK-LABEL: @phisi8_reverse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i8> [ [[IN1:%.*]], [[ENTRY:%.*]] ], [ [[IN2:%.*]], [[BB0]] ]
+; CHECK-NEXT:    ret <4 x i8> [[TMP0]]
+;
+entry:
+  %a0 = extractelement <4 x i8> %in1, i64 0
+  %a1 = extractelement <4 x i8> %in1, i64 1
+  %a2 = extractelement <4 x i8> %in1, i64 2
+  %a3 = extractelement <4 x i8> %in1, i64 3
+  br i1 %cmp1, label %bb1, label %bb0
+
+bb0:
+  %b0 = extractelement <4 x i8> %in2, i64 0
+  %b1 = extractelement <4 x i8> %in2, i64 1
+  %b2 = extractelement <4 x i8> %in2, i64 2
+  %b3 = extractelement <4 x i8> %in2, i64 3
+  br label %bb1
+
+bb1:
+  %c3 = phi i8 [ %a3, %entry ], [ %b3, %bb0 ]
+  %c2 = phi i8 [ %a2, %entry ], [ %b2, %bb0 ]
+  %c1 = phi i8 [ %a1, %entry ], [ %b1, %bb0 ]
+  %c0 = phi i8 [ %a0, %entry ], [ %b0, %bb0 ]
+
+  %o0 = insertelement <4 x i8> undef, i8 %c0, i64 0
+  %o1 = insertelement <4 x i8> %o0, i8 %c1, i64 1
+  %o2 = insertelement <4 x i8> %o1, i8 %c2, i64 2
+  %o3 = insertelement <4 x i8> %o2, i8 %c3, i64 3
+  ret <4 x i8> %o3
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
index aceee8840bb40..567760f695598 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -549,3 +549,330 @@ entry:
 
   ret float %add3
 }
+
+define i8 @reduction_v4i8(<4 x i8> %a) {
+; GCN-LABEL: @reduction_v4i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x i8> [[A:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x i8> [[A]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x i8> [[A]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x i8> [[A]], i64 3
+; GCN-NEXT:    [[ADD1:%.*]] = add i8 [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[ADD2:%.*]] = add i8 [[ELT2]], [[ADD1]]
+; GCN-NEXT:    [[ADD3:%.*]] = add i8 [[ELT3]], [[ADD2]]
+; GCN-NEXT:    ret i8 [[ADD3]]
+;
+entry:
+  %elt0 = extractelement <4 x i8> %a, i64 0
+  %elt1 = extractelement <4 x i8> %a, i64 1
+  %elt2 = extractelement <4 x i8> %a, i64 2
+  %elt3 = extractelement <4 x i8> %a, i64 3
+
+  %add1 = add i8 %elt1, %elt0
+  %add2 = add i8 %elt2, %add1
+  %add3 = add i8 %elt3, %add2
+
+  ret i8 %add3
+}
+
+define i8 @reduction_v8i8(<8 x i8> %vec8) {
+; GCN-LABEL: @reduction_v8i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <8 x i8> [[VEC8:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <8 x i8> [[VEC8]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <8 x i8> [[VEC8]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <8 x i8> [[VEC8]], i64 3
+; GCN-NEXT:    [[ELT4:%.*]] = extractelement <8 x i8> [[VEC8]], i64 4
+; GCN-NEXT:    [[ELT5:%.*]] = extractelement <8 x i8> [[VEC8]], i64 5
+; GCN-NEXT:    [[ELT6:%.*]] = extractelement <8 x i8> [[VEC8]], i64 6
+; GCN-NEXT:    [[ELT7:%.*]] = extractelement <8 x i8> [[VEC8]], i64 7
+; GCN-NEXT:    [[ADD1:%.*]] = add i8 [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[ADD2:%.*]] = add i8 [[ELT2]], [[ADD1]]
+; GCN-NEXT:    [[ADD3:%.*]] = add i8 [[ELT3]], [[ADD2]]
+; GCN-NEXT:    [[ADD4:%.*]] = add i8 [[ELT4]], [[ADD3]]
+; GCN-NEXT:    [[ADD5:%.*]] = add i8 [[ELT5]], [[ADD4]]
+; GCN-NEXT:    [[ADD6:%.*]] = add i8 [[ELT6]], [[ADD5]]
+; GCN-NEXT:    [[ADD7:%.*]] = add i8 [[ELT7]], [[ADD6]]
+; GCN-NEXT:    ret i8 [[ADD7]]
+;
+entry:
+  %elt0 = extractelement <8 x i8> %vec8, i64 0
+  %elt1 = extractelement <8 x i8> %vec8, i64 1
+  %elt2 = extractelement <8 x i8> %vec8, i64 2
+  %elt3 = extractelement <8 x i8> %vec8, i64 3
+  %elt4 = extractelement <8 x i8> %vec8, i64 4
+  %elt5 = extractelement <8 x i8> %vec8, i64 5
+  %elt6 = extractelement <8 x i8> %vec8, i64 6
+  %elt7 = extractelement <8 x i8> %vec8, i64 7
+
+  %add1 = add i8 %elt1, %elt0
+  %add2 = add i8 %elt2, %add1
+  %add3 = add i8 %elt3, %add2
+  %add4 = add i8 %elt4, %add3
+  %add5 = add i8 %elt5, %add4
+  %add6 = add i8 %elt6, %add5
+  %add7 = add i8 %elt7, %add6
+
+  ret i8 %add7
+}
+
+define i8 @reduction_umin_v4i8(<4 x i8> %vec4) {
+; GCN-LABEL: @reduction_umin_v4i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x i8> [[VEC4:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x i8> [[VEC4]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x i8> [[VEC4]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x i8> [[VEC4]], i64 3
+; GCN-NEXT:    [[CMP1:%.*]] = icmp ult i8 [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[MIN1:%.*]] = select i1 [[CMP1]], i8 [[ELT1]], i8 [[ELT0]]
+; GCN-NEXT:    [[CMP2:%.*]] = icmp ult i8 [[ELT2]], [[MIN1]]
+; GCN-NEXT:    [[MIN2:%.*]] = select i1 [[CMP2]], i8 [[ELT2]], i8 [[MIN1]]
+; GCN-NEXT:    [[CMP3:%.*]] = icmp ult i8 [[ELT3]], [[MIN2]]
+; GCN-NEXT:    [[MIN3:%.*]] = select i1 [[CMP3]], i8 [[ELT3]], i8 [[MIN2]]
+; GCN-NEXT:    ret i8 [[MIN3]]
+;
+entry:
+  %elt0 = extractelement <4 x i8> %vec4, i64 0
+  %elt1 = extractelement <4 x i8> %vec4, i64 1
+  %elt2 = extractelement <4 x i8> %vec4, i64 2
+  %elt3 = extractelement <4 x i8> %vec4, i64 3
+
+  %cmp1 = icmp ult i8 %elt1, %elt0
+  %min1 = select i1 %cmp1, i8 %elt1, i8 %elt0
+  %cmp2 = icmp ult i8 %elt2, %min1
+  %min2 = select i1 %cmp2, i8 %elt2, i8 %min1
+  %cmp3 = icmp ult i8 %elt3, %min2
+  %min3 = select i1 %cmp3, i8 %elt3, i8 %min2
+
+  ret i8 %min3
+}
+
+define i8 @reduction_icmp_v8i8(<8 x i8> %vec8) {
+; GCN-LABEL: @reduction_icmp_v8i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <8 x i8> [[VEC8:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <8 x i8> [[VEC8]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <8 x i8> [[VEC8]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <8 x i8> [[VEC8]], i64 3
+; GCN-NEXT:    [[ELT4:%.*]] = extractelement <8 x i8> [[VEC8]], i64 4
+; GCN-NEXT:    [[ELT5:%.*]] = extractelement <8 x i8> [[VEC8]], i64 5
+; GCN-NEXT:    [[ELT6:%.*]] = extractelement <8 x i8> [[VEC8]], i64 6
+; GCN-NEXT:    [[ELT7:%.*]] = extractelement <8 x i8> [[VEC8]], i64 7
+; GCN-NEXT:    [[CMP0:%.*]] = icmp ult i8 [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[MIN1:%.*]] = select i1 [[CMP0]], i8 [[ELT1]], i8 [[ELT0]]
+; GCN-NEXT:    [[CMP1:%.*]] = icmp ult i8 [[ELT2]], [[MIN1]]
+; GCN-NEXT:    [[MIN2:%.*]] = select i1 [[CMP1]], i8 [[ELT2]], i8 [[MIN1]]
+; GCN-NEXT:    [[CMP2:%.*]] = icmp ult i8 [[ELT3]], [[MIN2]]
+; GCN-NEXT:    [[MIN3:%.*]] = select i1 [[CMP2]], i8 [[ELT3]], i8 [[MIN2]]
+; GCN-NEXT:    [[CMP3:%.*]] = icmp ult i8 [[ELT4]], [[MIN3]]
+; GCN-NEXT:    [[MIN4:%.*]] = select i1 [[CMP3]], i8 [[ELT4]], i8 [[MIN3]]
+; GCN-NEXT:    [[CMP4:%.*]] = icmp ult i8 [[ELT5]], [[MIN4]]
+; GCN-NEXT:    [[MIN5:%.*]] = select i1 [[CMP4]], i8 [[ELT5]], i8 [[MIN4]]
+; GCN-NEXT:    [[CMP5:%.*]] = icmp ult i8 [[ELT6]], [[MIN5]]
+; GCN-NEXT:    [[MIN6:%.*]] = select i1 [[CMP5]], i8 [[ELT6]], i8 [[MIN5]]
+; GCN-NEXT:    [[CMP6:%.*]] = icmp ult i8 [[ELT7]], [[MIN6]]
+; GCN-NEXT:    [[MIN7:%.*]] = select i1 [[CMP6]], i8 [[ELT7]], i8 [[MIN6]]
+; GCN-NEXT:    ret i8 [[MIN7]]
+;
+entry:
+  %elt0 = extractelement <8 x i8> %vec8, i64 0
+  %elt1 = extractelement <8 x i8> %vec8, i64 1
+  %elt2 = extractelement <8 x i8> %vec8, i64 2
+  %elt3 = extractelement <8 x i8> %vec8, i64 3
+  %elt4 = extractelement <8 x i8> %vec8, i64 4
+  %elt5 = extractelement <8 x i8> %vec8, i64 5
+  %elt6 = extractelement <8 x i8> %vec8, i64 6
+  %elt7 = extractelement <8 x i8> %vec8, i64 7
+
+  %cmp0 = icmp ult i8 %elt1, %elt0
+  %min1 = select i1 %cmp0, i8 %elt1, i8 %elt0
+  %cmp1 = icmp ult i8 %elt2, %min1
+  %min2 = select i1 %cmp1, i8 %elt2, i8 %min1
+  %cmp2 = icmp ult i8 %elt3, %min2
+  %min3 = select i1 %cmp2, i8 %elt3, i8 %min2
+
+  %cmp3 = icmp ult i8 %elt4, %min3
+  %min4 = select i1 %cmp3, i8 %elt4, i8 %min3
+  %cmp4 = icmp ult i8 %elt5, %min4
+  %min5 = select i1 %cmp4, i8 %elt5, i8 %min4
+
+  %cmp5 = icmp ult i8 %elt6, %min5
+  %min6 = select i1 %cmp5, i8 %elt6, i8 %min5
+  %cmp6 = icmp ult i8 %elt7, %min6
+  %min7 = select i1 %cmp6, i8 %elt7, i8 %min6
+
+  ret i8 %min7
+}
+
+define i8 @reduction_smin_v16i8(<16 x i8> %vec16) {
+; GCN-LABEL: @reduction_smin_v16i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <16 x i8> [[VEC16:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <16 x i8> [[VEC16]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <16 x i8> [[VEC16]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <16 x i8> [[VEC16]], i64 3
+; GCN-NEXT:    [[ELT4:%.*]] = extractelement <16 x i8> [[VEC16]], i64 4
+; GCN-NEXT:    [[ELT5:%.*]] = extractelement <16 x i8> [[VEC16]], i64 5
+; GCN-NEXT:    [[ELT6:%.*]] = extractelement <16 x i8> [[VEC16]], i64 6
+; GCN-NEXT:    [[ELT7:%.*]] = extractelement <16 x i8> [[VEC16]], i64 7
+; GCN-NEXT:    [[ELT8:%.*]] = extractelement <16 x i8> [[VEC16]], i64 8
+; GCN-NEXT:    [[ELT9:%.*]] = extractelement <16 x i8> [[VEC16]], i64 9
+; GCN-NEXT:    [[ELT10:%.*]] = extractelement <16 x i8> [[VEC16]], i64 10
+; GCN-NEXT:    [[ELT11:%.*]] = extractelement <16 x i8> [[VEC16]], i64 11
+; GCN-NEXT:    [[ELT12:%.*]] = extractelement <16 x i8> [[VEC16]], i64 12
+; GCN-NEXT:    [[ELT13:%.*]] = extractelement <16 x i8> [[VEC16]], i64 13
+; GCN-NEXT:    [[ELT14:%.*]] = extractelement <16 x i8> [[VEC16]], i64 14
+; GCN-NEXT:    [[ELT15:%.*]] = extractelement <16 x i8> [[VEC16]], i64 15
+; GCN-NEXT:    [[CMP0:%.*]] = icmp slt i8 [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[MIN1:%.*]] = select i1 [[CMP0]], i8 [[ELT1]], i8 [[ELT0]]
+; GCN-NEXT:    [[CMP1:%.*]] = icmp slt i8 [[ELT2]], [[MIN1]]
+; GCN-NEXT:    [[MIN2:%.*]] = select i1 [[CMP1]], i8 [[ELT2]], i8 [[MIN1]]
+; GCN-NEXT:    [[CMP2:%.*]] = icmp slt i8 [[ELT3]], [[MIN2]]
+; GCN-NEXT:    [[MIN3:%.*]] = select i1 [[CMP2]], i8 [[ELT3]], i8 [[MIN2]]
+; GCN-NEXT:    [[CMP3:%.*]] = icmp slt i8 [[ELT4]], [[MIN3]]
+; GCN-NEXT:    [[MIN4:%.*]] = select i1 [[CMP3]], i8 [[ELT4]], i8 [[MIN3]]
+; GCN-NEXT:    [[CMP4:%.*]] = icmp slt i8 [[ELT5]], [[MIN4]]
+; GCN-NEXT:    [[MIN5:%.*]] = select i1 [[CMP4]], i8 [[ELT5]], i8 [[MIN4]]
+; GCN-NEXT:    [[CMP5:%.*]] = icmp slt i8 [[ELT6]], [[MIN5]]
+; GCN-NEXT:    [[MIN6:%.*]] = select i1 [[CMP5]], i8 [[ELT6]], i8 [[MIN5]]
+; GCN-NEXT:    [[CMP6:%.*]] = icmp slt i8 [[ELT7]], [[MIN6]]
+; GCN-NEXT:    [[MIN7:%.*]] = select i1 [[CMP6]], i8 [[ELT7]], i8 [[MIN6]]
+; GCN-NEXT:    [[CMP7:%.*]] = icmp slt i8 [[ELT8]], [[MIN7]]
+; GCN-NEXT:    [[MIN8:%.*]] = select i1 [[CMP7]], i8 [[ELT8]], i8 [[MIN7]]
+; GCN-NEXT:    [[CMP8:%.*]] = icmp slt i8 [[ELT9]], [[MIN8]]
+; GCN-NEXT:    [[MIN9:%.*]] = select i1 [[CMP8]], i8 [[ELT9]], i8 [[MIN8]]
+; GCN-NEXT:    [[CMP9:%.*]] = icmp slt i8 [[ELT10]], [[MIN9]]
+; GCN-NEXT:    [[MIN10:%.*]] = select i1 [[CMP9]], i8 [[ELT10]], i8 [[MIN9]]
+; GCN-NEXT:    [[CMP10:%.*]] = icmp slt i8 [[ELT11]], [[MIN10]]
+; GCN-NEXT:    [[MIN11:%.*]] = select i1 [[CMP10]], i8 [[ELT11]], i8 [[MIN10]]
+; GCN-NEXT:    [[CMP11:%.*]] = icmp slt i8 [[ELT12]], [[MIN11]]
+; GCN-NEXT:    [[MIN12:%.*]] = select i1 [[CMP11]], i8 [[ELT12]], i8 [[MIN11]]
+; GCN-NEXT:    [[CMP12:%.*]] = icmp slt i8 [[ELT13]], [[MIN12]]
+; GCN-NEXT:    [[MIN13:%.*]] = select i1 [[CMP12]], i8 [[ELT13]], i8 [[MIN12]]
+; GCN-NEXT:    [[CMP13:%.*]] = icmp slt i8 [[ELT14]], [[MIN13]]
+; GCN-NEXT:    [[MIN14:%.*]] = select i1 [[CMP13]], i8 [[ELT14]], i8 [[MIN13]]
+; GCN-NEXT:    [[CMP14:%.*]] = icmp slt i8 [[ELT15]], [[MIN14]]
+; GCN-NEXT:    [[MIN15:%.*]] = select i1 [[CMP14]], i8 [[ELT15]], i8 [[MIN14]]
+; GCN-NEXT:    ret i8 [[MIN15]]
+;
+entry:
+  %elt0 = extractelement <16 x i8> %vec16, i64 0
+  %elt1 = extractelement <16 x i8> %vec16, i64 1
+  %elt2 = extractelement <16 x i8> %vec16, i64 2
+  %elt3 = extractelement <16 x i8> %vec16, i64 3
+  %elt4 = extractelement <16 x i8> %vec16, i64 4
+  %elt5 = extractelement <16 x i8> %vec16, i64 5
+  %elt6 = extractelement <16 x i8> %vec16, i64 6
+  %elt7 = extractelement <16 x i8> %vec16, i64 7
+
+  %elt8 = extractelement <16 x i8> %vec16, i64 8
+  %elt9 = extractelement <16 x i8> %vec16, i64 9
+  %elt10 = extractelement <16 x i8> %vec16, i64 10
+  %elt11 = extractelement <16 x i8> %vec16, i64 11
+  %elt12 = extractelement <16 x i8> %vec16, i64 12
+  %elt13 = extractelement <16 x i8> %vec16, i64 13
+  %elt14 = extractelement <16 x i8> %vec16, i64 14
+  %elt15 = extractelement <16 x i8> %vec16, i64 15
+
+  %cmp0 = icmp slt i8 %elt1, %elt0
+  %min1 = select i1 %cmp0, i8 %elt1, i8 %elt0
+  %cmp1 = icmp slt i8 %elt2, %min1
+  %min2 = select i1 %cmp1, i8 %elt2, i8 %min1
+  %cmp2 = icmp slt i8 %elt3, %min2
+  %min3 = select i1 %cmp2, i8 %elt3, i8 %min2
+
+  %cmp3 = icmp slt i8 %elt4, %min3
+  %min4 = select i1 %cmp3, i8 %elt4, i8 %min3
+  %cmp4 = icmp slt i8 %elt5, %min4
+  %min5 = select i1 %cmp4, i8 %elt5, i8 %min4
+
+  %cmp5 = icmp slt i8 %elt6, %min5
+  %min6 = select i1 %cmp5, i8 %elt6, i8 %min5
+  %cmp6 = icmp slt i8 %elt7, %min6
+  %min7 = select i1 %cmp6, i8 %elt7, i8 %min6
+
+  %cmp7 = icmp slt i8 %elt8, %min7
+  %min8 = select i1 %cmp7, i8 %elt8, i8 %min7
+  %cmp8 = icmp slt i8 %elt9, %min8
+  %min9 = select i1 %cmp8, i8 %elt9, i8 %min8
+
+  %cmp9 = icmp slt i8 %elt10, %min9
+  %min10 = select i1 %cmp9, i8 %elt10, i8 %min9
+  %cmp10 = icmp slt i8 %elt11, %min10
+  %min11 = select i1 %cmp10, i8 %elt11, i8 %min10
+
+  %cmp11 = icmp slt i8 %elt12, %min11
+  %min12 = select i1 %cmp11, i8 %elt12, i8 %min11
+  %cmp12 = icmp slt i8 %elt13, %min12
+  %min13 = select i1 %cmp12, i8 %elt13, i8 %min12
+
+  %cmp13 = icmp slt i8 %elt14, %min13
+  %min14 = select i1 %cmp13, i8 %elt14, i8 %min13
+  %cmp14 = icmp slt i8 %elt15, %min14
+  %min15 = select i1 %cmp14, i8 %elt15, i8 %min14
+
+
+  ret i8 %min15
+}
+
+define i8 @reduction_umax_v4i8(<4 x i8> %vec4) {
+; GCN-LABEL: @reduction_umax_v4i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x i8> [[VEC4:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x i8> [[VEC4]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x i8> [[VEC4]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x i8> [[VEC4]], i64 3
+; GCN-NEXT:    [[CMP1:%.*]] = icmp ugt i8 [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], i8 [[ELT1]], i8 [[ELT0]]
+; GCN-NEXT:    [[CMP2:%.*]] = icmp ugt i8 [[ELT2]], [[MAX1]]
+; GCN-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], i8 [[ELT2]], i8 [[MAX1]]
+; GCN-NEXT:    [[CMP3:%.*]] = icmp ugt i8 [[ELT3]], [[MAX2]]
+; GCN-NEXT:    [[MAX3:%.*]] = select i1 [[CMP3]], i8 [[ELT3]], i8 [[MAX2]]
+; GCN-NEXT:    ret i8 [[MAX3]]
+;
+entry:
+  %elt0 = extractelement <4 x i8> %vec4, i64 0
+  %elt1 = extractelement <4 x i8> %vec4, i64 1
+  %elt2 = extractelement <4 x i8> %vec4, i64 2
+  %elt3 = extractelement <4 x i8> %vec4, i64 3
+
+  %cmp1 = icmp ugt i8 %elt1, %elt0
+  %max1 = select i1 %cmp1, i8 %elt1, i8 %elt0
+  %cmp2 = icmp ugt i8 %elt2, %max1
+  %max2 = select i1 %cmp2, i8 %elt2, i8 %max1
+  %cmp3 = icmp ugt i8 %elt3, %max2
+  %max3 = select i1 %cmp3, i8 %elt3, i8 %max2
+
+  ret i8 %max3
+}
+
+define i8 @reduction_smax_v4i8(<4 x i8> %vec4) {
+; GCN-LABEL: @reduction_smax_v4i8(
+; GCN-NEXT:  entry:
+; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x i8> [[VEC4:%.*]], i64 0
+; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x i8> [[VEC4]], i64 1
+; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x i8> [[VEC4]], i64 2
+; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x i8> [[VEC4]], i64 3
+; GCN-NEXT:    [[CMP1:%.*]] = icmp sgt i8 [[ELT1]], [[ELT0]]
+; GCN-NEXT:    [[MAX1:%.*]] = select i1 [[CMP1]], i8 [[ELT1]], i8 [[ELT0]]
+; GCN-NEXT:    [[CMP2:%.*]] = icmp sgt i8 [[ELT2]], [[MAX1]]
+; GCN-NEXT:    [[MAX2:%.*]] = select i1 [[CMP2]], i8 [[ELT2]], i8 [[MAX1]]
+; GCN-NEXT:    [[CMP3:%.*]] = icmp sgt i8 [[ELT3]], [[MAX2]]
+; GCN-NEXT:    [[MAX3:%.*]] = select i1 [[CMP3]], i8 [[ELT3]], i8 [[MAX2]]
+; GCN-NEXT:    ret i8 [[MAX3]]
+;
+entry:
+  %elt0 = extractelement <4 x i8> %vec4, i64 0
+  %elt1 = extractelement <4 x i8> %vec4, i64 1
+  %elt2 = extractelement <4 x i8> %vec4, i64 2
+  %elt3 = extractelement <4 x i8> %vec4, i64 3
+
+  %cmp1 = icmp sgt i8 %elt1, %elt0
+  %max1 = select i1 %cmp1, i8 %elt1, i8 %elt0
+  %cmp2 = icmp sgt i8 %elt2, %max1
+  %max2 = select i1 %cmp2, i8 %elt2, i8 %max1
+  %cmp3 = icmp sgt i8 %elt3, %max2
+  %max3 = select i1 %cmp3, i8 %elt3, i8 %max2
+
+  ret i8 %max3
+}

>From eb6c7cda23226d723d3066dceccef5923987ccc6 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 13 Jun 2024 12:16:25 -0700
Subject: [PATCH 2/3] Fix auto

Change-Id: I8a9505b7b611d12349509be7271f071965c382e2
---
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 2430ad2a9a3aa..e27fd4d0a1e33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -307,9 +307,9 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
 }
 
 unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
-  if (auto VTy = dyn_cast<FixedVectorType>(Tp)) {
+  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
     if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
-      auto ElCount = VTy->getElementCount().getFixedValue();
+      unsigned ElCount = VTy->getElementCount().getFixedValue();
       return ElCount / 4;
     }
   }

>From b503e2144a9845b2d5d9b1a554707a45acbb239d Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 17 Jun 2024 13:21:22 -0700
Subject: [PATCH 3/3] [AMDGPU] Vectorize i8 Shuffles

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  19 +-
 .../CostModel/AMDGPU/shufflevector.ll         | 416 +++++++++---------
 .../Transforms/SLPVectorizer/AMDGPU/i8.ll     |  57 +--
 .../SLPVectorizer/AMDGPU/reduction.ll         |  28 +-
 4 files changed, 236 insertions(+), 284 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index e27fd4d0a1e33..e73df181c8622 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1148,14 +1148,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1164,9 +1165,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1178,12 +1179,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index a18156744a36b..adcabe70ec609 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -396,157 +396,157 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) {
 ; Should not assert
 define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
 ; ALL-LABEL: 'shufflevector_i8'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'shufflevector_i8'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
@@ -861,22 +861,22 @@ define amdgpu_kernel void @shufflevector_i32(<2 x i32> %vec1, <2 x i32> %vec2) {
 ; Other shuffle cases
 define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i8> %i8v16, <16 x i8> %i8v16_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) {
 ; GFX9-10-LABEL: 'shuffle'
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
 ; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
 ; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -898,22 +898,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8>
 ; GFX9-10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; VI-LABEL: 'shuffle'
-; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
 ; VI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
 ; VI-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -935,22 +935,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8>
 ; VI-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-10-SIZE-LABEL: 'shuffle'
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
 ; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
 ; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -972,22 +972,22 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8>
 ; GFX9-10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; VI-SIZE-LABEL: 'shuffle'
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
 ; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
 ; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
 ; VI-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -1047,9 +1047,9 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8>
 
 define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x half> %halfv8, <8 x half> %halfv8_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) {
 ; ALL-LABEL: 'concat'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1062,9 +1062,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1080,9 +1080,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'concat'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1095,9 +1095,9 @@ define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll
index 9584e30e4ccaf..67129f2cccebf 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/i8.ll
@@ -79,37 +79,19 @@ define protected amdgpu_kernel void @phi(ptr addrspace(3) %inptr0, ptr addrspace
 ; GFX8PLUS-LABEL: @phi(
 ; GFX8PLUS-NEXT:  entry:
 ; GFX8PLUS-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0
-; GFX8PLUS-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8PLUS-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8PLUS-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8PLUS-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX8PLUS-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8PLUS-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX8PLUS-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8PLUS-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8PLUS-NEXT:    br label [[DO_BODY:%.*]]
 ; GFX8PLUS:       do.body:
-; GFX8PLUS-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[OTHERELE3:%.*]], [[DO_BODY]] ]
-; GFX8PLUS-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[OTHERELE2:%.*]], [[DO_BODY]] ]
-; GFX8PLUS-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[OTHERELE1:%.*]], [[DO_BODY]] ]
-; GFX8PLUS-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[OTHERELE0:%.*]], [[DO_BODY]] ]
-; GFX8PLUS-NEXT:    [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
-; GFX8PLUS-NEXT:    [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
-; GFX8PLUS-NEXT:    [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
-; GFX8PLUS-NEXT:    [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX8PLUS-NEXT:    [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
-; GFX8PLUS-NEXT:    [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
-; GFX8PLUS-NEXT:    [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
-; GFX8PLUS-NEXT:    [[VEC03:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
-; GFX8PLUS-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8PLUS-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX8PLUS-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX8PLUS-NEXT:    [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
-; GFX8PLUS-NEXT:    store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1:%.*]], align 2
+; GFX8PLUS-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[DO_BODY]] ]
+; GFX8PLUS-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8PLUS-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    store <16 x i8> [[TMP4]], ptr addrspace(3) [[INPTR1:%.*]], align 2
 ; GFX8PLUS-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0
 ; GFX8PLUS-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[DO_BODY]]
 ; GFX8PLUS:       exit:
-; GFX8PLUS-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 16
-; GFX8PLUS-NEXT:    store <16 x i8> [[VEC03]], ptr [[OUT1:%.*]], align 16
+; GFX8PLUS-NEXT:    store <16 x i8> [[TMP4]], ptr [[OUT:%.*]], align 16
+; GFX8PLUS-NEXT:    store <16 x i8> [[TMP3]], ptr [[OUT1:%.*]], align 16
 ; GFX8PLUS-NEXT:    ret void
 ;
 entry:
@@ -223,35 +205,24 @@ define protected amdgpu_kernel void @arith_phi(ptr addrspace(3) %inptr0, ptr %ou
 ; GFX8PLUS-LABEL: @arith_phi(
 ; GFX8PLUS-NEXT:  entry:
 ; GFX8PLUS-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0:%.*]], i32 0
-; GFX8PLUS-NEXT:    [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8PLUS-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
-; GFX8PLUS-NEXT:    [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
 ; GFX8PLUS-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
-; GFX8PLUS-NEXT:    [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
 ; GFX8PLUS-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
-; GFX8PLUS-NEXT:    [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
+; GFX8PLUS-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8PLUS-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG:%.*]], 0
 ; GFX8PLUS-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[BB_1:%.*]]
 ; GFX8PLUS:       bb.1:
-; GFX8PLUS-NEXT:    [[ADD0:%.*]] = add i8 [[ELE0]], 1
-; GFX8PLUS-NEXT:    [[ADD1:%.*]] = add i8 [[ELE1]], 1
-; GFX8PLUS-NEXT:    [[ADD2:%.*]] = add i8 [[ELE2]], 1
-; GFX8PLUS-NEXT:    [[ADD3:%.*]] = add i8 [[ELE3]], 1
+; GFX8PLUS-NEXT:    [[TMP1:%.*]] = add <4 x i8> [[TMP0]], <i8 1, i8 1, i8 1, i8 1>
+; GFX8PLUS-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; GFX8PLUS-NEXT:    br label [[EXIT]]
 ; GFX8PLUS:       exit:
-; GFX8PLUS-NEXT:    [[PHI0:%.*]] = phi i8 [ [[ELE3]], [[ENTRY:%.*]] ], [ [[ADD0]], [[BB_1]] ]
-; GFX8PLUS-NEXT:    [[PHI1:%.*]] = phi i8 [ [[ELE2]], [[ENTRY]] ], [ [[ADD1]], [[BB_1]] ]
-; GFX8PLUS-NEXT:    [[PHI2:%.*]] = phi i8 [ [[ELE1]], [[ENTRY]] ], [ [[ADD2]], [[BB_1]] ]
-; GFX8PLUS-NEXT:    [[PHI3:%.*]] = phi i8 [ [[ELE0]], [[ENTRY]] ], [ [[ADD3]], [[BB_1]] ]
+; GFX8PLUS-NEXT:    [[TMP3:%.*]] = phi <4 x i8> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB_1]] ]
 ; GFX8PLUS-NEXT:    [[OTHERELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
 ; GFX8PLUS-NEXT:    [[OTHERELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
 ; GFX8PLUS-NEXT:    [[OTHERELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
 ; GFX8PLUS-NEXT:    [[OTHERELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
-; GFX8PLUS-NEXT:    [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
-; GFX8PLUS-NEXT:    [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
-; GFX8PLUS-NEXT:    [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
-; GFX8PLUS-NEXT:    [[VEC13:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
-; GFX8PLUS-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT:%.*]], align 2
+; GFX8PLUS-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    store <16 x i8> [[TMP4]], ptr [[OUT:%.*]], align 2
 ; GFX8PLUS-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
index 567760f695598..c159387cbd782 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -553,14 +553,8 @@ entry:
 define i8 @reduction_v4i8(<4 x i8> %a) {
 ; GCN-LABEL: @reduction_v4i8(
 ; GCN-NEXT:  entry:
-; GCN-NEXT:    [[ELT0:%.*]] = extractelement <4 x i8> [[A:%.*]], i64 0
-; GCN-NEXT:    [[ELT1:%.*]] = extractelement <4 x i8> [[A]], i64 1
-; GCN-NEXT:    [[ELT2:%.*]] = extractelement <4 x i8> [[A]], i64 2
-; GCN-NEXT:    [[ELT3:%.*]] = extractelement <4 x i8> [[A]], i64 3
-; GCN-NEXT:    [[ADD1:%.*]] = add i8 [[ELT1]], [[ELT0]]
-; GCN-NEXT:    [[ADD2:%.*]] = add i8 [[ELT2]], [[ADD1]]
-; GCN-NEXT:    [[ADD3:%.*]] = add i8 [[ELT3]], [[ADD2]]
-; GCN-NEXT:    ret i8 [[ADD3]]
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[A:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
 ;
 entry:
   %elt0 = extractelement <4 x i8> %a, i64 0
@@ -578,22 +572,8 @@ entry:
 define i8 @reduction_v8i8(<8 x i8> %vec8) {
 ; GCN-LABEL: @reduction_v8i8(
 ; GCN-NEXT:  entry:
-; GCN-NEXT:    [[ELT0:%.*]] = extractelement <8 x i8> [[VEC8:%.*]], i64 0
-; GCN-NEXT:    [[ELT1:%.*]] = extractelement <8 x i8> [[VEC8]], i64 1
-; GCN-NEXT:    [[ELT2:%.*]] = extractelement <8 x i8> [[VEC8]], i64 2
-; GCN-NEXT:    [[ELT3:%.*]] = extractelement <8 x i8> [[VEC8]], i64 3
-; GCN-NEXT:    [[ELT4:%.*]] = extractelement <8 x i8> [[VEC8]], i64 4
-; GCN-NEXT:    [[ELT5:%.*]] = extractelement <8 x i8> [[VEC8]], i64 5
-; GCN-NEXT:    [[ELT6:%.*]] = extractelement <8 x i8> [[VEC8]], i64 6
-; GCN-NEXT:    [[ELT7:%.*]] = extractelement <8 x i8> [[VEC8]], i64 7
-; GCN-NEXT:    [[ADD1:%.*]] = add i8 [[ELT1]], [[ELT0]]
-; GCN-NEXT:    [[ADD2:%.*]] = add i8 [[ELT2]], [[ADD1]]
-; GCN-NEXT:    [[ADD3:%.*]] = add i8 [[ELT3]], [[ADD2]]
-; GCN-NEXT:    [[ADD4:%.*]] = add i8 [[ELT4]], [[ADD3]]
-; GCN-NEXT:    [[ADD5:%.*]] = add i8 [[ELT5]], [[ADD4]]
-; GCN-NEXT:    [[ADD6:%.*]] = add i8 [[ELT6]], [[ADD5]]
-; GCN-NEXT:    [[ADD7:%.*]] = add i8 [[ELT7]], [[ADD6]]
-; GCN-NEXT:    ret i8 [[ADD7]]
+; GCN-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> [[VEC8:%.*]])
+; GCN-NEXT:    ret i8 [[TMP0]]
 ;
 entry:
   %elt0 = extractelement <8 x i8> %vec8, i64 0