[llvm] [AMDGPU] Enable vectorization of i8 values. (PR #134934)

Tue Apr 15 09:42:35 PDT 2025

https://github.com/doru1004 updated https://github.com/llvm/llvm-project/pull/134934

>From ce790a6d16359c4ad39881932534b55bf07f32b9 Mon Sep 17 00:00:00 2001
From: Doru Bercea <doru.bercea at amd.com>
Date: Mon, 7 Apr 2025 11:03:20 -0400
Subject: [PATCH] Enable vectorization of i8 values.

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  43 ++-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  14 +
 .../test/Analysis/CostModel/AMDGPU/add-sub.ll |  28 +-
 .../test/Analysis/CostModel/AMDGPU/bit-ops.ll |  72 ++---
 llvm/test/Analysis/CostModel/AMDGPU/mul.ll    |  96 +++---
 llvm/test/Analysis/CostModel/AMDGPU/rem.ll    |  84 ++---
 llvm/test/Analysis/CostModel/AMDGPU/shifts.ll |  96 +++---
 .../AMDGPU/vectorize-i8-as-i32.ll             | 292 ++++++++++++++++++
 8 files changed, 531 insertions(+), 194 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09f7877b13b3a..b0bdf5164eaf4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  return ElemWidth == 8                                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -537,6 +538,12 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
 
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
 
+  VectorType *VecTy = dyn_cast<VectorType>(Ty);
+  InstructionCost LTTypeCost = LT.first;
+  if (VecTy &&
+      VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
+    LTTypeCost = (((LT.first - 1) / 4) + 1);
+
   switch (ISD) {
   case ISD::SHL:
   case ISD::SRL:
@@ -548,7 +555,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
       NElts = (NElts + 1) / 2;
 
     // i32
-    return getFullRateInstrCost() * LT.first * NElts;
+    return getFullRateInstrCost() * LTTypeCost * NElts;
   case ISD::ADD:
   case ISD::SUB:
   case ISD::AND:
@@ -562,7 +569,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
     if (ST->has16BitInsts() && SLT == MVT::i16)
       NElts = (NElts + 1) / 2;
 
-    return LT.first * NElts * getFullRateInstrCost();
+    return LTTypeCost * NElts * getFullRateInstrCost();
   case ISD::MUL: {
     const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
     if (SLT == MVT::i64) {
@@ -574,7 +581,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
       NElts = (NElts + 1) / 2;
 
     // i32
-    return QuarterRateCost * NElts * LT.first;
+    return QuarterRateCost * NElts * LTTypeCost;
   }
   case ISD::FMUL:
     // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
@@ -1423,3 +1430,27 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src))
+    if (Opcode == Instruction::Load && VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext())) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return ((ElementCount - 1) / 4) + 1;
+    }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp))
+    if (VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext())) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return ((ElementCount - 1) / 4) + 1;
+    }
+  return BaseT::getNumberOfParts(Tp);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index f5062070ac6f4..dcd3c70fbe4b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -282,6 +282,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+  /// Account for loads of i8 vector types to have reduced cost. For
+  /// example the cost of load 4 i8s values is one is the cost of loading
+  /// a single i32 value.
+  InstructionCost getMemoryOpCost(
+      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind,
+      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+      const Instruction *I = nullptr);
+
+  /// When counting parts on AMD GPUs, account for i8s being grouped
+  /// together under a single i32 value. Otherwise fall back to base
+  /// implementation.
+  unsigned getNumberOfParts(Type *Tp);
 };
 
 } // end namespace llvm
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 2a966b4ea178f..1fb00e97c71f9 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -126,24 +126,24 @@ define amdgpu_kernel void @add_i16() #0 {
 define amdgpu_kernel void @add_i8() #0 {
 ; ALL-LABEL: 'add_i8'
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = add i8 undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = add <2 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = add <3 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = add <4 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = add <5 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v6i8 = add <6 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8 = add <32 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v33i8 = add <33 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = add <2 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = add <3 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = add <4 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = add <5 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6i8 = add <6 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8 = add <32 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v33i8 = add <33 x i8> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'add_i8'
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = add i8 undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = add <2 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = add <3 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = add <4 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = add <5 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v6i8 = add <6 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8 = add <32 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v33i8 = add <33 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = add <2 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = add <3 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = add <4 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = add <5 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6i8 = add <6 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8 = add <32 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v33i8 = add <33 x i8> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %i8 = add i8 undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
index e64615f32e1a4..0ed576255ad1a 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
@@ -8,9 +8,9 @@
 define amdgpu_kernel void @or() #0 {
 ; SLOW16-LABEL: 'or'
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = or i8 undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = or <2 x i8> undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = or <3 x i8> undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = or <4 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = or <2 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = or <3 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = or <4 x i8> undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = or i16 undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = or <2 x i16> undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = or <3 x i16> undef, undef
@@ -27,9 +27,9 @@ define amdgpu_kernel void @or() #0 {
 ;
 ; FAST16-LABEL: 'or'
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = or i8 undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = or <2 x i8> undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = or <3 x i8> undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = or <4 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = or <2 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = or <3 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = or <4 x i8> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = or i16 undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = or <2 x i16> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = or <3 x i16> undef, undef
@@ -46,9 +46,9 @@ define amdgpu_kernel void @or() #0 {
 ;
 ; SLOW16-SIZE-LABEL: 'or'
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = or i8 undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = or <2 x i8> undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = or <3 x i8> undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = or <4 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = or <2 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = or <3 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = or <4 x i8> undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = or i16 undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = or <2 x i16> undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = or <3 x i16> undef, undef
@@ -65,9 +65,9 @@ define amdgpu_kernel void @or() #0 {
 ;
 ; FAST16-SIZE-LABEL: 'or'
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = or i8 undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = or <2 x i8> undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = or <3 x i8> undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = or <4 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = or <2 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = or <3 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = or <4 x i8> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = or i16 undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = or <2 x i16> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = or <3 x i16> undef, undef
@@ -104,9 +104,9 @@ define amdgpu_kernel void @or() #0 {
 define amdgpu_kernel void @xor() #0 {
 ; SLOW16-LABEL: 'xor'
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = xor i8 undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = xor <2 x i8> undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = xor <3 x i8> undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = xor <4 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = xor <2 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = xor <3 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = xor <4 x i8> undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = xor i16 undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = xor <2 x i16> undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = xor <3 x i16> undef, undef
@@ -123,9 +123,9 @@ define amdgpu_kernel void @xor() #0 {
 ;
 ; FAST16-LABEL: 'xor'
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = xor i8 undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = xor <2 x i8> undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = xor <3 x i8> undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = xor <4 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = xor <2 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = xor <3 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = xor <4 x i8> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = xor i16 undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = xor <2 x i16> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = xor <3 x i16> undef, undef
@@ -142,9 +142,9 @@ define amdgpu_kernel void @xor() #0 {
 ;
 ; SLOW16-SIZE-LABEL: 'xor'
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = xor i8 undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = xor <2 x i8> undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = xor <3 x i8> undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = xor <4 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = xor <2 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = xor <3 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = xor <4 x i8> undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = xor i16 undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = xor <2 x i16> undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = xor <3 x i16> undef, undef
@@ -161,9 +161,9 @@ define amdgpu_kernel void @xor() #0 {
 ;
 ; FAST16-SIZE-LABEL: 'xor'
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = xor i8 undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = xor <2 x i8> undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = xor <3 x i8> undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = xor <4 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = xor <2 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = xor <3 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = xor <4 x i8> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = xor i16 undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = xor <2 x i16> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = xor <3 x i16> undef, undef
@@ -200,9 +200,9 @@ define amdgpu_kernel void @xor() #0 {
 define amdgpu_kernel void @and() #0 {
 ; SLOW16-LABEL: 'and'
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = and i8 undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = and <2 x i8> undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = and <3 x i8> undef, undef
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = and <4 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = and <2 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = and <3 x i8> undef, undef
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = and <4 x i8> undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = and i16 undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = and <2 x i16> undef, undef
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = and <3 x i16> undef, undef
@@ -219,9 +219,9 @@ define amdgpu_kernel void @and() #0 {
 ;
 ; FAST16-LABEL: 'and'
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = and i8 undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = and <2 x i8> undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = and <3 x i8> undef, undef
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = and <4 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = and <2 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = and <3 x i8> undef, undef
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = and <4 x i8> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = and i16 undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = and <2 x i16> undef, undef
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = and <3 x i16> undef, undef
@@ -238,9 +238,9 @@ define amdgpu_kernel void @and() #0 {
 ;
 ; SLOW16-SIZE-LABEL: 'and'
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = and i8 undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = and <2 x i8> undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = and <3 x i8> undef, undef
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = and <4 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = and <2 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = and <3 x i8> undef, undef
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = and <4 x i8> undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = and i16 undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = and <2 x i16> undef, undef
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = and <3 x i16> undef, undef
@@ -257,9 +257,9 @@ define amdgpu_kernel void @and() #0 {
 ;
 ; FAST16-SIZE-LABEL: 'and'
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = and i8 undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = and <2 x i8> undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = and <3 x i8> undef, undef
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = and <4 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = and <2 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = and <3 x i8> undef, undef
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = and <4 x i8> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = and i16 undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = and <2 x i16> undef, undef
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = and <3 x i16> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index ca5d2fdd938b5..ba1bfd5df2f3e 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -127,9 +127,9 @@ define i32 @mul_constpow2() {
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i16 = mul <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i16 = mul <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; FAST16-LABEL: 'mul_constpow2'
@@ -146,9 +146,9 @@ define i32 @mul_constpow2() {
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32i16 = mul <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW16-SIZE-LABEL: 'mul_constpow2'
@@ -165,9 +165,9 @@ define i32 @mul_constpow2() {
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32i16 = mul <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; FAST16-SIZE-LABEL: 'mul_constpow2'
@@ -184,9 +184,9 @@ define i32 @mul_constpow2() {
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = mul <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = mul <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = mul <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = mul i64 undef, 16
@@ -227,9 +227,9 @@ define i32 @mul_uniformconstpow2() {
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i16 = mul <16 x i16> undef, splat (i16 16)
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i16 = mul <32 x i16> undef, splat (i16 16)
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 16)
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 16)
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 16)
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 16)
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 16)
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 16)
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; FAST16-LABEL: 'mul_uniformconstpow2'
@@ -246,9 +246,9 @@ define i32 @mul_uniformconstpow2() {
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, splat (i16 16)
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32i16 = mul <32 x i16> undef, splat (i16 16)
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 16)
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 16)
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 16)
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 16)
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 16)
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 16)
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW16-SIZE-LABEL: 'mul_uniformconstpow2'
@@ -265,9 +265,9 @@ define i32 @mul_uniformconstpow2() {
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, splat (i16 16)
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32i16 = mul <32 x i16> undef, splat (i16 16)
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 16)
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 16)
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 16)
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 16)
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 16)
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 16)
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; FAST16-SIZE-LABEL: 'mul_uniformconstpow2'
@@ -284,9 +284,9 @@ define i32 @mul_uniformconstpow2() {
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, splat (i16 16)
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, splat (i16 16)
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 16)
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 16)
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 16)
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 16)
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 16)
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 16)
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = mul i64 undef, 16
@@ -327,9 +327,9 @@ define i32 @mul_constnegpow2() {
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; FAST16-LABEL: 'mul_constnegpow2'
@@ -346,9 +346,9 @@ define i32 @mul_constnegpow2() {
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW16-SIZE-LABEL: 'mul_constnegpow2'
@@ -365,9 +365,9 @@ define i32 @mul_constnegpow2() {
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; FAST16-SIZE-LABEL: 'mul_constnegpow2'
@@ -384,9 +384,9 @@ define i32 @mul_constnegpow2() {
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = mul <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = mul <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = mul <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = mul i64 undef, -16
@@ -427,9 +427,9 @@ define i32 @mul_uniformconstnegpow2() {
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i16 = mul <16 x i16> undef, splat (i16 -16)
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i16 = mul <32 x i16> undef, splat (i16 -16)
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 -16)
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 -16)
-; SLOW16-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 -16)
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 -16)
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 -16)
+; SLOW16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 -16)
 ; SLOW16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; FAST16-LABEL: 'mul_uniformconstnegpow2'
@@ -446,9 +446,9 @@ define i32 @mul_uniformconstnegpow2() {
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, splat (i16 -16)
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32i16 = mul <32 x i16> undef, splat (i16 -16)
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 -16)
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 -16)
-; FAST16-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 -16)
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 -16)
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 -16)
+; FAST16-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 -16)
 ; FAST16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW16-SIZE-LABEL: 'mul_uniformconstnegpow2'
@@ -465,9 +465,9 @@ define i32 @mul_uniformconstnegpow2() {
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef, splat (i16 -16)
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32i16 = mul <32 x i16> undef, splat (i16 -16)
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 -16)
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 -16)
-; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 -16)
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 -16)
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 -16)
+; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 -16)
 ; SLOW16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; FAST16-SIZE-LABEL: 'mul_uniformconstnegpow2'
@@ -484,9 +484,9 @@ define i32 @mul_uniformconstnegpow2() {
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef, splat (i16 -16)
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef, splat (i16 -16)
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 -16)
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 -16)
-; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 -16)
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = mul <16 x i8> undef, splat (i8 -16)
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = mul <32 x i8> undef, splat (i8 -16)
+; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = mul <64 x i8> undef, splat (i8 -16)
 ; FAST16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %I64 = mul i64 undef, -16
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/rem.ll b/llvm/test/Analysis/CostModel/AMDGPU/rem.ll
index 7e37566db4417..2705c7d0aa867 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/rem.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/rem.ll
@@ -44,9 +44,9 @@ define i32 @srem() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = srem <16 x i16> undef, undef
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = srem <32 x i16> undef, undef
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = srem <16 x i8> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = srem <32 x i8> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = srem <64 x i8> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = srem <32 x i8> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = srem <64 x i8> undef, undef
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'srem'
@@ -125,9 +125,9 @@ define i32 @urem() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = urem <16 x i16> undef, undef
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = urem <32 x i16> undef, undef
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = urem i8 undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = urem <16 x i8> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = urem <32 x i8> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = urem <64 x i8> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = urem <16 x i8> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = urem <32 x i8> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = urem <64 x i8> undef, undef
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'urem'
@@ -206,9 +206,9 @@ define i32 @srem_const() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 7
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'srem_const'
@@ -287,9 +287,9 @@ define i32 @urem_const() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = urem i8 undef, 7
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'urem_const'
@@ -368,9 +368,9 @@ define i32 @srem_uniformconst() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 7)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 7)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 7
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 7)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 7)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 7)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 7)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 7)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 7)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'srem_uniformconst'
@@ -449,9 +449,9 @@ define i32 @urem_uniformconst() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 7)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 7)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = urem i8 undef, 7
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 7)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 7)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 7)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 7)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 7)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 7)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'urem_uniformconst'
@@ -530,9 +530,9 @@ define i32 @srem_constpow2() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = srem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = srem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = srem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'srem_constpow2'
@@ -611,9 +611,9 @@ define i32 @urem_constpow2() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = urem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = urem <32 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = urem i8 undef, 16
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = urem <16 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = urem <32 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = urem <64 x i8> undef, <i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16, i8 2, i8 4, i8 8, i8 16>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'urem_constpow2'
@@ -692,9 +692,9 @@ define i32 @srem_uniformconstpow2() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 16)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 16)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'srem_uniformconstpow2'
@@ -773,9 +773,9 @@ define i32 @urem_uniformconstpow2() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = urem i8 undef, 16
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 16)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 16)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'urem_uniformconstpow2'
@@ -854,9 +854,9 @@ define i32 @srem_constnegpow2() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = srem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = srem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, -16
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = srem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = srem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'srem_constnegpow2'
@@ -935,9 +935,9 @@ define i32 @urem_constnegpow2() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = urem <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = urem <32 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = urem i8 undef, -16
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = urem <16 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = urem <32 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = urem <64 x i8> undef, <i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16, i8 -2, i8 -4, i8 -8, i8 -16>
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'urem_constnegpow2'
@@ -1016,9 +1016,9 @@ define i32 @srem_uniformconstnegpow2() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, -16
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'srem_uniformconstnegpow2'
@@ -1097,9 +1097,9 @@ define i32 @urem_uniformconstnegpow2() {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 -16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 266 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 -16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = urem i8 undef, -16
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 -16)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 -16)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 522 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 -16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 -16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 -16)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 277 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 -16)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'urem_uniformconstnegpow2'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
index a94c684291ba7..19e6eb9af1a22 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
@@ -8,10 +8,10 @@
 define amdgpu_kernel void @shl() #0 {
 ; FAST64-LABEL: 'shl'
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = shl i8 undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = shl <2 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = shl <3 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shl <4 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = shl <5 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shl <2 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = shl <3 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shl <4 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = shl <5 x i8> undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = shl i16 undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shl <2 x i16> undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = shl <3 x i16> undef, undef
@@ -31,10 +31,10 @@ define amdgpu_kernel void @shl() #0 {
 ;
 ; SLOW64-LABEL: 'shl'
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = shl i8 undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = shl <2 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = shl <3 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shl <4 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = shl <5 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shl <2 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = shl <3 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shl <4 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = shl <5 x i8> undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = shl i16 undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = shl <2 x i16> undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = shl <3 x i16> undef, undef
@@ -54,10 +54,10 @@ define amdgpu_kernel void @shl() #0 {
 ;
 ; FAST64-SIZE-LABEL: 'shl'
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = shl i8 undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = shl <2 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = shl <3 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shl <4 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = shl <5 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shl <2 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = shl <3 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shl <4 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = shl <5 x i8> undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = shl i16 undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = shl <2 x i16> undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = shl <3 x i16> undef, undef
@@ -77,10 +77,10 @@ define amdgpu_kernel void @shl() #0 {
 ;
 ; SLOW64-SIZE-LABEL: 'shl'
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = shl i8 undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = shl <2 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = shl <3 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shl <4 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = shl <5 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = shl <2 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = shl <3 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = shl <4 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = shl <5 x i8> undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = shl i16 undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = shl <2 x i16> undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = shl <3 x i16> undef, undef
@@ -124,10 +124,10 @@ define amdgpu_kernel void @shl() #0 {
 define amdgpu_kernel void @lshr() #0 {
 ; FAST64-LABEL: 'lshr'
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = lshr i8 undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = lshr <2 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = lshr <3 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = lshr <4 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = lshr <5 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = lshr <2 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = lshr <3 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = lshr <4 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = lshr <5 x i8> undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = lshr i16 undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = lshr <2 x i16> undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = lshr <3 x i16> undef, undef
@@ -147,10 +147,10 @@ define amdgpu_kernel void @lshr() #0 {
 ;
 ; SLOW64-LABEL: 'lshr'
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = lshr i8 undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = lshr <2 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = lshr <3 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = lshr <4 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = lshr <5 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = lshr <2 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = lshr <3 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = lshr <4 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = lshr <5 x i8> undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = lshr i16 undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = lshr <2 x i16> undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = lshr <3 x i16> undef, undef
@@ -170,10 +170,10 @@ define amdgpu_kernel void @lshr() #0 {
 ;
 ; FAST64-SIZE-LABEL: 'lshr'
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = lshr i8 undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = lshr <2 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = lshr <3 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = lshr <4 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = lshr <5 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = lshr <2 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = lshr <3 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = lshr <4 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = lshr <5 x i8> undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = lshr i16 undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = lshr <2 x i16> undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = lshr <3 x i16> undef, undef
@@ -193,10 +193,10 @@ define amdgpu_kernel void @lshr() #0 {
 ;
 ; SLOW64-SIZE-LABEL: 'lshr'
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = lshr i8 undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = lshr <2 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = lshr <3 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = lshr <4 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = lshr <5 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = lshr <2 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = lshr <3 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = lshr <4 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = lshr <5 x i8> undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = lshr i16 undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = lshr <2 x i16> undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = lshr <3 x i16> undef, undef
@@ -240,10 +240,10 @@ define amdgpu_kernel void @lshr() #0 {
 define amdgpu_kernel void @ashr() #0 {
 ; FAST64-LABEL: 'ashr'
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = ashr i8 undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = ashr <2 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = ashr <3 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = ashr <4 x i8> undef, undef
-; FAST64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = ashr <5 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = ashr <2 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = ashr <3 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = ashr <4 x i8> undef, undef
+; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = ashr <5 x i8> undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = ashr i16 undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = ashr <2 x i16> undef, undef
 ; FAST64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = ashr <3 x i16> undef, undef
@@ -263,10 +263,10 @@ define amdgpu_kernel void @ashr() #0 {
 ;
 ; SLOW64-LABEL: 'ashr'
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = ashr i8 undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = ashr <2 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = ashr <3 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = ashr <4 x i8> undef, undef
-; SLOW64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = ashr <5 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = ashr <2 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = ashr <3 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = ashr <4 x i8> undef, undef
+; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = ashr <5 x i8> undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = ashr i16 undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = ashr <2 x i16> undef, undef
 ; SLOW64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = ashr <3 x i16> undef, undef
@@ -286,10 +286,10 @@ define amdgpu_kernel void @ashr() #0 {
 ;
 ; FAST64-SIZE-LABEL: 'ashr'
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = ashr i8 undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = ashr <2 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = ashr <3 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = ashr <4 x i8> undef, undef
-; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = ashr <5 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = ashr <2 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = ashr <3 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = ashr <4 x i8> undef, undef
+; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = ashr <5 x i8> undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = ashr i16 undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = ashr <2 x i16> undef, undef
 ; FAST64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3i16 = ashr <3 x i16> undef, undef
@@ -309,10 +309,10 @@ define amdgpu_kernel void @ashr() #0 {
 ;
 ; SLOW64-SIZE-LABEL: 'ashr'
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = ashr i8 undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = ashr <2 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = ashr <3 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = ashr <4 x i8> undef, undef
-; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = ashr <5 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = ashr <2 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = ashr <3 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = ashr <4 x i8> undef, undef
+; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = ashr <5 x i8> undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i16 = ashr i16 undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16 = ashr <2 x i16> undef, undef
 ; SLOW64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i16 = ashr <3 x i16> undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll
new file mode 100644
index 0000000000000..42bfd4a1585ef
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8-as-i32.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefix=GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefix=GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefix=GFX9 %s
+
+define protected amdgpu_kernel void @arith(<16 x i8> %invec, ptr %out, i32 %flag) {
+; GFX7-LABEL: define protected amdgpu_kernel void @arith(
+; GFX7-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX7-NEXT:  [[ENTRY:.*:]]
+; GFX7-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX7-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
+; GFX7-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
+; GFX7-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GFX7-NEXT:    [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1)
+; GFX7-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1)
+; GFX7-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GFX7-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1)
+; GFX7-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1)
+; GFX7-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GFX7-NEXT:    [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1)
+; GFX7-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1)
+; GFX7-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GFX7-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; GFX7-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; GFX7-NEXT:    store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
+; GFX7-NEXT:    ret void
+;
+; GFX8-LABEL: define protected amdgpu_kernel void @arith(
+; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX8-NEXT:  [[ENTRY:.*:]]
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
+; GFX8-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GFX8-NEXT:    [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1)
+; GFX8-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1)
+; GFX8-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GFX8-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1)
+; GFX8-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1)
+; GFX8-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GFX8-NEXT:    [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1)
+; GFX8-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1)
+; GFX8-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GFX8-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; GFX8-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; GFX8-NEXT:    store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
+; GFX8-NEXT:    ret void
+;
+; GFX9-LABEL: define protected amdgpu_kernel void @arith(
+; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX9-NEXT:  [[ENTRY:.*:]]
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
+; GFX9-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GFX9-NEXT:    [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1)
+; GFX9-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1)
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GFX9-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1)
+; GFX9-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1)
+; GFX9-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GFX9-NEXT:    [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1)
+; GFX9-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1)
+; GFX9-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GFX9-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; GFX9-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; GFX9-NEXT:    store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
+; GFX9-NEXT:    ret void
+;
+; GFX8PLUS-LABEL: define protected amdgpu_kernel void @arith(
+; GFX8PLUS-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GFX8PLUS-NEXT:  [[ENTRY:.*:]]
+; GFX8PLUS-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8PLUS-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
+; GFX8PLUS-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
+; GFX8PLUS-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; GFX8PLUS-NEXT:    [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1)
+; GFX8PLUS-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1)
+; GFX8PLUS-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; GFX8PLUS-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1)
+; GFX8PLUS-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1)
+; GFX8PLUS-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; GFX8PLUS-NEXT:    [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1)
+; GFX8PLUS-NEXT:    [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1)
+; GFX8PLUS-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GFX8PLUS-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; GFX8PLUS-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; GFX8PLUS-NEXT:    store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
+; GFX8PLUS-NEXT:    ret void
+entry:
+  %el0 = extractelement <16 x i8> %invec, i64  0
+  %el1 = extractelement <16 x i8> %invec, i64  1
+  %el2 = extractelement <16 x i8> %invec, i64  2
+  %el3 = extractelement <16 x i8> %invec, i64  3
+  %el4 = extractelement <16 x i8> %invec, i64  4
+  %el5 = extractelement <16 x i8> %invec, i64  5
+  %el6 = extractelement <16 x i8> %invec, i64  6
+  %el7 = extractelement <16 x i8> %invec, i64  7
+  %el8 = extractelement <16 x i8> %invec, i64  8
+  %el9 = extractelement <16 x i8> %invec, i64  9
+  %el10 = extractelement <16 x i8> %invec, i64 10
+  %el11 = extractelement <16 x i8> %invec, i64 11
+  %el12 = extractelement <16 x i8> %invec, i64 12
+  %el13 = extractelement <16 x i8> %invec, i64 13
+  %el14 = extractelement <16 x i8> %invec, i64 14
+  %el15 = extractelement <16 x i8> %invec, i64 15
+  %mul0 = mul i8 %el0, 1
+  %mul1 = mul i8 %el1, 1
+  %mul2 = mul i8 %el2, 1
+  %mul3 = mul i8 %el3, 1
+  %mul4 = mul i8 %el4, 1
+  %mul5 = mul i8 %el5, 1
+  %mul6 = mul i8 %el6, 1
+  %mul7 = mul i8 %el7, 1
+  %mul8 = mul i8 %el8, 1
+  %mul9 = mul i8 %el9, 1
+  %mul10 = mul i8 %el10, 1
+  %mul11 = mul i8 %el11, 1
+  %mul12 = mul i8 %el12, 1
+  %mul13 = mul i8 %el13, 1
+  %mul14 = mul i8 %el14, 1
+  %mul15 = mul i8 %el15, 1
+  %add0 = add i8 %mul0, 1
+  %add1 = add i8 %mul1, 1
+  %add2 = add i8 %mul2, 1
+  %add3 = add i8 %mul3, 1
+  %add4 = add i8 %mul4, 1
+  %add5 = add i8 %mul5, 1
+  %add6 = add i8 %mul6, 1
+  %add7 = add i8 %mul7, 1
+  %add8 = add i8 %mul8, 1
+  %add9 = add i8 %mul9, 1
+  %add10 = add i8 %mul10, 1
+  %add11 = add i8 %mul11, 1
+  %add12 = add i8 %mul12, 1
+  %add13 = add i8 %mul13, 1
+  %add14 = add i8 %mul14, 1
+  %add15 = add i8 %mul15, 1
+  %vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0
+  %vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1
+  %vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2
+  %vecins3 = insertelement <16 x i8> %vecins2, i8 %add3, i64 3
+  %vecins4 = insertelement <16 x i8> %vecins3, i8 %add4, i64 4
+  %vecins5 = insertelement <16 x i8> %vecins4, i8 %add5, i64 5
+  %vecins6 = insertelement <16 x i8> %vecins5, i8 %add6, i64 6
+  %vecins7 = insertelement <16 x i8> %vecins6, i8 %add7, i64 7
+  %vecins8 = insertelement <16 x i8> %vecins7, i8 %add8, i64 8
+  %vecins9 = insertelement <16 x i8> %vecins8, i8 %add9, i64 9
+  %vecins10 = insertelement <16 x i8> %vecins9, i8 %add10, i64 10
+  %vecins11 = insertelement <16 x i8> %vecins10, i8 %add11, i64 11
+  %vecins12 = insertelement <16 x i8> %vecins11, i8 %add12, i64 12
+  %vecins13 = insertelement <16 x i8> %vecins12, i8 %add13, i64 13
+  %vecins14 = insertelement <16 x i8> %vecins13, i8 %add14, i64 14
+  %vecins15 = insertelement <16 x i8> %vecins14, i8 %add15, i64 15
+  store <16 x i8> %vecins15, ptr %out
+  ret void
+}
+
+define protected amdgpu_kernel void @phi(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) {
+; GFX7-LABEL: define protected amdgpu_kernel void @phi(
+; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
+; GFX7-NEXT:  [[ENTRY:.*]]:
+; GFX7-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
+; GFX7-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT:    br label %[[DO_BODY:.*]]
+; GFX7:       [[DO_BODY]]:
+; GFX7-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX7-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX7-NEXT:    [[VEC03:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    [[VEC13:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX7-NEXT:    store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1]], align 2
+; GFX7-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
+; GFX7-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
+; GFX7:       [[EXIT]]:
+; GFX7-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT]], align 16
+; GFX7-NEXT:    store <16 x i8> [[VEC03]], ptr [[OUT1]], align 16
+; GFX7-NEXT:    ret void
+;
+; GFX8-LABEL: define protected amdgpu_kernel void @phi(
+; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT:  [[ENTRY:.*]]:
+; GFX8-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
+; GFX8-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8-NEXT:    br label %[[DO_BODY:.*]]
+; GFX8:       [[DO_BODY]]:
+; GFX8-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    [[VEC131:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8-NEXT:    store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
+; GFX8-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
+; GFX8-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
+; GFX8:       [[EXIT]]:
+; GFX8-NEXT:    store <16 x i8> [[VEC131]], ptr [[OUT]], align 16
+; GFX8-NEXT:    store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
+; GFX8-NEXT:    ret void
+;
+; GFX9-LABEL: define protected amdgpu_kernel void @phi(
+; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT:  [[ENTRY:.*]]:
+; GFX9-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
+; GFX9-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX9-NEXT:    br label %[[DO_BODY:.*]]
+; GFX9:       [[DO_BODY]]:
+; GFX9-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX9-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    [[VEC131:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX9-NEXT:    store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
+; GFX9-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
+; GFX9-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
+; GFX9:       [[EXIT]]:
+; GFX9-NEXT:    store <16 x i8> [[VEC131]], ptr [[OUT]], align 16
+; GFX9-NEXT:    store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
+; GFX9-NEXT:    ret void
+;
+; GFX8PLUS-LABEL: define protected amdgpu_kernel void @phi(
+; GFX8PLUS-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
+; GFX8PLUS-NEXT:  [[ENTRY:.*]]:
+; GFX8PLUS-NEXT:    [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
+; GFX8PLUS-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8PLUS-NEXT:    br label %[[DO_BODY:.*]]
+; GFX8PLUS:       [[DO_BODY]]:
+; GFX8PLUS-NEXT:    [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
+; GFX8PLUS-NEXT:    [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
+; GFX8PLUS-NEXT:    [[VEC03:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    [[VEC13:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; GFX8PLUS-NEXT:    store <16 x i8> [[VEC13]], ptr addrspace(3) [[INPTR1]], align 2
+; GFX8PLUS-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
+; GFX8PLUS-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
+; GFX8PLUS:       [[EXIT]]:
+; GFX8PLUS-NEXT:    store <16 x i8> [[VEC13]], ptr [[OUT]], align 16
+; GFX8PLUS-NEXT:    store <16 x i8> [[VEC03]], ptr [[OUT1]], align 16
+; GFX8PLUS-NEXT:    ret void
+entry:
+  %gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0
+  %ele0 = load i8, ptr addrspace(3) %gep0, align 8
+  %gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1
+  %ele1 = load i8, ptr addrspace(3) %gep1, align 1
+  %gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2
+  %ele2 = load i8, ptr addrspace(3) %gep2, align 2
+  %gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3
+  %ele3 = load i8, ptr addrspace(3) %gep3, align 1
+  br label %do.body
+
+do.body:
+  %phi0 = phi i8 [ %ele3, %entry ], [ %otherele3, %do.body ]
+  %phi1 = phi i8 [ %ele2, %entry ], [ %otherele2, %do.body ]
+  %phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ]
+  %phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ]
+  %otherele0 = load i8, ptr addrspace(3) %gep0, align 8
+  %otherele1 = load i8, ptr addrspace(3) %gep1, align 1
+  %otherele2 = load i8, ptr addrspace(3) %gep2, align 2
+  %otherele3 = load i8, ptr addrspace(3) %gep3, align 1
+  %vec00 = insertelement <16 x i8> poison, i8 %otherele0, i64 8
+  %vec01 = insertelement <16 x i8> %vec00, i8 %otherele1, i64 9
+  %vec02 = insertelement <16 x i8> %vec01, i8 %otherele2, i64 10
+  %vec03 = insertelement <16 x i8> %vec02, i8 %otherele3, i64 11
+  %vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8
+  %vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9
+  %vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10
+  %vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11
+  store <16 x i8> %vec13, ptr addrspace(3) %inptr1, align 2
+  %cmp = icmp eq i32 %flag, 0
+  br i1 %cmp, label %exit, label %do.body
+
+exit:
+  store <16 x i8> %vec13, ptr %out
+  store <16 x i8> %vec03, ptr %out1
+  ret void
+}
+