[llvm] 9068c20 - [AMDGPU][CostModel] Refine cost model for half- and quarter-rate instructions.

Sat Oct 24 09:53:25 PDT 2020

Author: dfukalov
Date: 2020-10-24T19:53:08+03:00
New Revision: 9068c209655efc597b31b23fc41630d82c5b98a4

URL: https://github.com/llvm/llvm-project/commit/9068c209655efc597b31b23fc41630d82c5b98a4
DIFF: https://github.com/llvm/llvm-project/commit/9068c209655efc597b31b23fc41630d82c5b98a4.diff

LOG: [AMDGPU][CostModel] Refine cost model for half- and quarter-rate instructions.

1. Throughput and code-size cost estimations were separated and updated.
2. Updated fdiv cost estimation for different cases.
3. Added scalarization processing for types that are treated as !isSimple() to
improve code-size estimation in getArithmeticInstrCost() and
getIntrinsicInstrCost(). The code was borrowed from the TCK_RecipThroughput path
of the base implementation.

The next step is to unify the scalarization part in the base class, which
currently works for the TCK_RecipThroughput path only.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D89973

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
    llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
    llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
    llvm/test/Analysis/CostModel/AMDGPU/fma.ll
    llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
    llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
    llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
    llvm/test/Analysis/CostModel/AMDGPU/mul.ll
    llvm/test/Analysis/CostModel/AMDGPU/shifts.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 31585ed5b9e4..2186739a3a24 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -472,9 +472,50 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // FIXME: We're having to query the throughput cost so that the basic
     // implementation tries to generate legalize and scalarization costs. Maybe
     // we could hoist the scalarization code here?
-    return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
-                                         Opd1Info, Opd2Info, Opd1PropInfo,
-                                         Opd2PropInfo, Args, CxtI);
+    if (CostKind != TTI::TCK_CodeSize)
+      return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
+                                           Opd1Info, Opd2Info, Opd1PropInfo,
+                                           Opd2PropInfo, Args, CxtI);
+    // Scalarization
+
+    // Check if any of the operands are vector operands.
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+    assert(ISD && "Invalid opcode");
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+    bool IsFloat = Ty->isFPOrFPVectorTy();
+    // Assume that floating point arithmetic operations cost twice as much as
+    // integer operations.
+    unsigned OpCost = (IsFloat ? 2 : 1);
+
+    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+      // The operation is legal. Assume it costs 1.
+      // TODO: Once we have extract/insert subvector cost we need to use them.
+      return LT.first * OpCost;
+    }
+
+    if (!TLI->isOperationExpand(ISD, LT.second)) {
+      // If the operation is custom lowered, then assume that the code is twice
+      // as expensive.
+      return LT.first * 2 * OpCost;
+    }
+
+    // Else, assume that we need to scalarize this op.
+    // TODO: If one of the types get legalized by splitting, handle this
+    // similarly to what getCastInstrCost() does.
+    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+      unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
+      unsigned Cost = getArithmeticInstrCost(
+          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
+          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
+      // Return the cost of multiple scalar invocation plus the cost of
+      // inserting and extracting the values.
+      return getScalarizationOverhead(VTy, Args) + Num * Cost;
+    }
+
+    // We don't know anything about this scalar instruction.
+    return OpCost;
   }
 
   // Legalize the type.
@@ -493,7 +534,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   case ISD::SRL:
   case ISD::SRA:
     if (SLT == MVT::i64)
-      return get64BitInstrCost() * LT.first * NElts;
+      return get64BitInstrCost(CostKind) * LT.first * NElts;
 
     if (ST->has16BitInsts() && SLT == MVT::i16)
       NElts = (NElts + 1) / 2;
@@ -515,7 +556,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
     return LT.first * NElts * getFullRateInstrCost();
   case ISD::MUL: {
-    const int QuarterRateCost = getQuarterRateInstrCost();
+    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
     if (SLT == MVT::i64) {
       const int FullRateCost = getFullRateInstrCost();
       return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
@@ -552,7 +593,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   case ISD::FADD:
   case ISD::FSUB:
     if (SLT == MVT::f64)
-      return LT.first * NElts * get64BitInstrCost();
+      return LT.first * NElts * get64BitInstrCost(CostKind);
 
     if (ST->has16BitInsts() && SLT == MVT::f16)
       NElts = (NElts + 1) / 2;
@@ -565,7 +606,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // FIXME: frem should be handled separately. The fdiv in it is most of it,
     // but the current lowering is also not entirely correct.
     if (SLT == MVT::f64) {
-      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+      int Cost = 7 * get64BitInstrCost(CostKind) +
+                 getQuarterRateInstrCost(CostKind) +
+                 3 * getHalfRateInstrCost(CostKind);
       // Add cost of workaround.
       if (!ST->hasUsableDivScaleConditionOutput())
         Cost += 3 * getFullRateInstrCost();
@@ -577,7 +620,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       // TODO: This is more complicated, unsafe flags etc.
       if ((SLT == MVT::f32 && !HasFP32Denormals) ||
           (SLT == MVT::f16 && ST->has16BitInsts())) {
-        return LT.first * getQuarterRateInstrCost() * NElts;
+        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
       }
     }
 
@@ -587,12 +630,15 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       // f32 fmul
       // v_cvt_f16_f32
       // f16 div_fixup
-      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
+      int Cost =
+          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
       return LT.first * Cost * NElts;
     }
 
     if (SLT == MVT::f32 || SLT == MVT::f16) {
-      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+      // 4 more v_cvt_* insts without f16 insts support
+      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
+                 1 * getQuarterRateInstrCost(CostKind);
 
       if (!HasFP32Denormals) {
         // FP mode switches.
@@ -642,7 +688,48 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   Type *RetTy = ICA.getReturnType();
   EVT OrigTy = TLI->getValueType(DL, RetTy);
   if (!OrigTy.isSimple()) {
-    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+    if (CostKind != TTI::TCK_CodeSize)
+      return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+    // TODO: Combine these two logic paths.
+    if (ICA.isTypeBasedOnly())
+      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
+    Type *RetTy = ICA.getReturnType();
+    unsigned VF = ICA.getVectorFactor();
+    unsigned RetVF =
+        (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
+                             : 1);
+    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+    const IntrinsicInst *I = ICA.getInst();
+    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
+    FastMathFlags FMF = ICA.getFlags();
+    // Assume that we need to scalarize this intrinsic.
+    SmallVector<Type *, 4> Types;
+    for (const Value *Op : Args) {
+      Type *OpTy = Op->getType();
+      assert(VF == 1 || !OpTy->isVectorTy());
+      Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
+    }
+
+    if (VF > 1 && !RetTy->isVoidTy())
+      RetTy = FixedVectorType::get(RetTy, VF);
+
+    // Compute the scalarization overhead based on Args for a vector
+    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+    // CostModel will pass a vector RetTy and VF is 1.
+    unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
+    if (RetVF > 1 || VF > 1) {
+      ScalarizationCost = 0;
+      if (!RetTy->isVoidTy())
+        ScalarizationCost +=
+            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
+      ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+    }
+
+    IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
+                                  ScalarizationCost, I);
+    return getIntrinsicInstrCost(Attrs, CostKind);
   }
 
   // Legalize the type.
@@ -654,16 +741,16 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
 
   if (SLT == MVT::f64)
-    return LT.first * NElts * get64BitInstrCost();
+    return LT.first * NElts * get64BitInstrCost(CostKind);
 
   if (ST->has16BitInsts() && SLT == MVT::f16)
     NElts = (NElts + 1) / 2;
 
   // TODO: Get more refined intrinsic costs?
-  unsigned InstRate = getQuarterRateInstrCost();
+  unsigned InstRate = getQuarterRateInstrCost(CostKind);
   if (ICA.getID() == Intrinsic::fma) {
-    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
-                                   : getQuarterRateInstrCost();
+    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
+                                   : getQuarterRateInstrCost(CostKind);
   }
 
   return LT.first * NElts * InstRate;
@@ -714,7 +801,7 @@ int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                          CostKind);
 
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
-  return LT.first * getHalfRateInstrCost();
+  return LT.first * getHalfRateInstrCost(CostKind);
 }
 
 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 29e30b6ef93d..22aa27e2ef4d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -115,21 +115,26 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
     return TargetTransformInfo::TCC_Basic;
   }
 
-  static inline int getHalfRateInstrCost() {
-    return 2 * TargetTransformInfo::TCC_Basic;
+  static inline int getHalfRateInstrCost(
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
+    return CostKind == TTI::TCK_CodeSize ? 2
+                                         : 2 * TargetTransformInfo::TCC_Basic;
   }
 
   // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
   // should be 2 or 4.
-  static inline int getQuarterRateInstrCost() {
-    return 3 * TargetTransformInfo::TCC_Basic;
+  static inline int getQuarterRateInstrCost(
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
+    return CostKind == TTI::TCK_CodeSize ? 2
+                                         : 4 * TargetTransformInfo::TCC_Basic;
   }
 
-   // On some parts, normal fp64 operations are half rate, and others
-   // quarter. This also applies to some integer operations.
-  inline int get64BitInstrCost() const {
-    return ST->hasHalfRate64Ops() ?
-      getHalfRateInstrCost() : getQuarterRateInstrCost();
+  // On some parts, normal fp64 operations are half rate, and others
+  // quarter. This also applies to some integer operations.
+  inline int get64BitInstrCost(
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const {
+    return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
+                                  : getQuarterRateInstrCost(CostKind);
   }
 
 public:

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index 1203182a83dd..c2959bbc62e0 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s
 
-; ALL: 'fadd_f32'
+; ALL-LABEL: 'fadd_f32'
 ; ALL: estimated cost of 1 for {{.*}} fadd float
 define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
@@ -12,7 +12,7 @@ define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)
   ret void
 }
 
-; ALL: 'fadd_v2f32'
+; ALL-LABEL: 'fadd_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
 define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
@@ -21,10 +21,8 @@ define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float
   ret void
 }
 
-; ALL: 'fadd_v3f32'
-; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
-; and 3 when it is legal.
-; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
+; ALL-LABEL: 'fadd_v3f32'
+; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
 define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fadd <3 x float> %vec, %b
@@ -32,10 +30,8 @@ define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float
   ret void
 }
 
-; ALL: 'fadd_v5f32'
-; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
-; and 5 when it is legal.
-; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
+; ALL-LABEL: 'fadd_v5f32'
+; ALL: estimated cost of 5 for {{.*}} fadd <5 x float>
 define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %add = fadd <5 x float> %vec, %b
@@ -43,9 +39,10 @@ define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float
   ret void
 }
 
-; ALL: 'fadd_f64'
+; ALL-LABEL: 'fadd_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
-; SLOWF64: estimated cost of 3 for {{.*}} fadd double
+; SLOWF64: estimated cost of 4 for {{.*}} fadd double
+; SIZEALL: estimated cost of 2 for {{.*}} fadd double
 define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fadd double %vec, %b
@@ -53,9 +50,10 @@ define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(
   ret void
 }
 
-; ALL: 'fadd_v2f64'
+; ALL-LABEL: 'fadd_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
-; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
+; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double>
+; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double>
 define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fadd <2 x double> %vec, %b
@@ -63,9 +61,10 @@ define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
   ret void
 }
 
-; ALL: 'fadd_v3f64'
+; ALL-LABEL: 'fadd_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
-; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
+; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double>
+; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double>
 define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fadd <3 x double> %vec, %b
@@ -73,7 +72,7 @@ define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
   ret void
 }
 
-; ALL: 'fadd_f16'
+; ALL-LABEL: 'fadd_f16'
 ; ALL: estimated cost of 1 for {{.*}} fadd half
 define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
@@ -82,7 +81,7 @@ define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)*
   ret void
 }
 
-; ALL: 'fadd_v2f16'
+; ALL-LABEL: 'fadd_v2f16'
 ; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half>
 ; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half>
 define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
@@ -92,7 +91,7 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
   ret void
 }
 
-; ALL: 'fadd_v3f16'
+; ALL-LABEL: 'fadd_v3f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half>
 define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
@@ -102,7 +101,7 @@ define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half>
   ret void
 }
 
-; ALL: 'fadd_v4f16'
+; ALL-LABEL: 'fadd_v4f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half>
 define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
index e898f0d2c461..883db92932a8 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -1,19 +1,18 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS  %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS  %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM,SLOWFP32DENORMS  %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,NOFP16,NOFP16-FP32DENORM,SLOWFP32DENORMS %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,THRPTALL,CIFASTF64,NOFP16 %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,THRPTALL,CISLOWF64,NOFP16  %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,THRPTALL,SIFASTF64,NOFP16  %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,THRPTALL,SISLOWF64,NOFP16  %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,THRPTALL,FP16,CISLOWF64 %s
 
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP16,NOFP16-NOFP32DENORM %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP16,NOFP16-NOFP32DENORM  %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP16,NOFP16-NOFP32DENORM  %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP16,NOFP16-NOFP32DENORM  %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FASTFP32DENORMS,FP16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZENOF16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16  %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16  %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZEF16 %s
 
-; ALL: 'fdiv_f32_ieee'
-; ALL: estimated cost of 10 for {{.*}} fdiv float
+; ALL-LABEL: 'fdiv_f32_ieee'
+; THRPTALL: estimated cost of 14 for {{.*}} fdiv float
+; SIZEALL: estimated cost of 12 for {{.*}} fdiv float
 define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fdiv float %vec, %b
@@ -21,8 +20,9 @@ define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspa
   ret void
 }
 
-; ALL: 'fdiv_f32_ftzdaz'
-; ALL: estimated cost of 12 for {{.*}} fdiv float
+; ALL-LABEL: 'fdiv_f32_ftzdaz'
+; THRPTALL: estimated cost of 16 for {{.*}} fdiv float
+; SIZEALL: estimated cost of 14 for {{.*}} fdiv float
 define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #1 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fdiv float %vec, %b
@@ -30,8 +30,9 @@ define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrs
   ret void
 }
 
-; ALL: 'fdiv_v2f32_ieee'
-; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float>
+; ALL-LABEL: 'fdiv_v2f32_ieee'
+; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
+; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
 define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> %vec, %b
@@ -39,8 +40,9 @@ define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x
   ret void
 }
 
-; ALL: 'fdiv_v2f32_ftzdaz'
-; ALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
+; ALL-LABEL: 'fdiv_v2f32_ftzdaz'
+; THRPTALL: estimated cost of 32 for {{.*}} fdiv <2 x float>
+; SIZEALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
 define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #1 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> %vec, %b
@@ -48,10 +50,9 @@ define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2
   ret void
 }
 
-; ALL: 'fdiv_v3f32_ieee'
-; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
-; and 36/30 when it is legal.
-; ALL: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float>
+; ALL-LABEL: 'fdiv_v3f32_ieee'
+; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
+; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float>
 define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fdiv <3 x float> %vec, %b
@@ -59,10 +60,9 @@ define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x
   ret void
 }
 
-; ALL: 'fdiv_v3f32_ftzdaz'
-; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
-; and 36/30 when it is legal.
-; ALL: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float>
+; ALL-LABEL: 'fdiv_v3f32_ftzdaz'
+; THRPTALL: estimated cost of 48 for {{.*}} fdiv <3 x float>
+; SIZEALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
 define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #1 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fdiv <3 x float> %vec, %b
@@ -70,10 +70,9 @@ define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3
   ret void
 }
 
-; ALL: 'fdiv_v5f32_ieee'
-; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
-; and 60/50 when it is legal.
-; ALL: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float>
+; ALL-LABEL: 'fdiv_v5f32_ieee'
+; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
+; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float>
 define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %add = fdiv <5 x float> %vec, %b
@@ -81,10 +80,9 @@ define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x
   ret void
 }
 
-; ALL: 'fdiv_v5f32_ftzdaz'
-; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
-; and 60/50 when it is legal.
-; ALL: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float>
+; ALL-LABEL: 'fdiv_v5f32_ftzdaz'
+; THRPTALL: estimated cost of 80 for {{.*}} fdiv <5 x float>
+; SIZEALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
 define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #1 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %add = fdiv <5 x float> %vec, %b
@@ -92,11 +90,13 @@ define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5
   ret void
 }
 
-; ALL: 'fdiv_f64'
-; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
-; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
-; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
-; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
+; ALL-LABEL: 'fdiv_f64'
+; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double
+; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double
+; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double
+; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
+; SIZECI: estimated cost of 22 for {{.*}} fdiv double
+; SIZESI: estimated cost of 25 for {{.*}} fdiv double
 define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fdiv double %vec, %b
@@ -104,11 +104,13 @@ define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(
   ret void
 }
 
-; ALL: 'fdiv_v2f64'
-; CIFASTF64: estimated cost of 58 for {{.*}} fdiv <2 x double>
-; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
-; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
-; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
+; ALL-LABEL: 'fdiv_v2f64'
+; CIFASTF64: estimated cost of 48 for {{.*}} fdiv <2 x double>
+; CISLOWF64: estimated cost of 76 for {{.*}} fdiv <2 x double>
+; SIFASTF64: estimated cost of 54 for {{.*}} fdiv <2 x double>
+; SISLOWF64: estimated cost of 82 for {{.*}} fdiv <2 x double>
+; SIZECI: estimated cost of 44 for {{.*}} fdiv <2 x double>
+; SIZESI: estimated cost of 50 for {{.*}} fdiv <2 x double>
 define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fdiv <2 x double> %vec, %b
@@ -116,11 +118,13 @@ define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
   ret void
 }
 
-; ALL: 'fdiv_v3f64'
-; CIFASTF64: estimated cost of 87 for {{.*}} fdiv <3 x double>
-; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
-; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
-; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
+; ALL-LABEL: 'fdiv_v3f64'
+; CIFASTF64: estimated cost of 72 for {{.*}} fdiv <3 x double>
+; CISLOWF64: estimated cost of 114 for {{.*}} fdiv <3 x double>
+; SIFASTF64: estimated cost of 81 for {{.*}} fdiv <3 x double>
+; SISLOWF64: estimated cost of 123 for {{.*}} fdiv <3 x double>
+; SIZECI: estimated cost of 66 for {{.*}} fdiv <3 x double>
+; SIZESI: estimated cost of 75 for {{.*}} fdiv <3 x double>
 define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fdiv <3 x double> %vec, %b
@@ -128,9 +132,11 @@ define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
   ret void
 }
 
-; ALL: 'fdiv_f16_f32_ieee'
-; NOFP16: estimated cost of 10 for {{.*}} fdiv half
-; FP16: estimated cost of 10 for {{.*}} fdiv half
+; ALL-LABEL: 'fdiv_f16_f32_ieee'
+; NOFP16: estimated cost of 14 for {{.*}} fdiv half
+; FP16: estimated cost of 12 for {{.*}} fdiv half
+; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half
+; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
 define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half %vec, %b
@@ -138,9 +144,11 @@ define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrs
   ret void
 }
 
-; ALL: 'fdiv_f16_f32_ftzdaz'
-; NOFP16: estimated cost of 12 for {{.*}} fdiv half
-; FP16: estimated cost of 10 for {{.*}} fdiv half
+; ALL-LABEL: 'fdiv_f16_f32_ftzdaz'
+; NOFP16: estimated cost of 16 for {{.*}} fdiv half
+; FP16: estimated cost of 12 for {{.*}} fdiv half
+; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half
+; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
 define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #1 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half %vec, %b
@@ -148,9 +156,11 @@ define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half add
   ret void
 }
 
-; ALL: 'fdiv_v2f16_f32_ieee'
-; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
-; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
+; ALL-LABEL: 'fdiv_v2f16_f32_ieee'
+; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
+; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
+; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
+; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
 define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> %vec, %b
@@ -158,9 +168,11 @@ define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2
   ret void
 }
 
-; ALL: 'fdiv_v2f16_f32_ftzdaz'
-; NOFP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
-; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
+; ALL-LABEL: 'fdiv_v2f16_f32_ftzdaz'
+; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half>
+; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
+; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half>
+; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
 define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #1 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> %vec, %b
@@ -168,9 +180,11 @@ define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out,
   ret void
 }
 
-; ALL: 'fdiv_v4f16_f32_ieee'
-; NOFP16: estimated cost of 40 for {{.*}} fdiv <4 x half>
-; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half>
+; ALL-LABEL: 'fdiv_v4f16_f32_ieee'
+; NOFP16: estimated cost of 56 for {{.*}} fdiv <4 x half>
+; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
+; SIZENOF16: estimated cost of 48 for {{.*}} fdiv <4 x half>
+; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half>
 define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fdiv <4 x half> %vec, %b
@@ -178,9 +192,11 @@ define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4
   ret void
 }
 
-; ALL: 'fdiv_v4f16_f32_ftzdaz'
-; NOFP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
-; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half>
+; ALL-LABEL: 'fdiv_v4f16_f32_ftzdaz'
+; NOFP16: estimated cost of 64 for {{.*}} fdiv <4 x half>
+; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
+; SIZENOF16: estimated cost of 56 for {{.*}} fdiv <4 x half>
+; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half>
 define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #1 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fdiv <4 x half> %vec, %b
@@ -188,9 +204,9 @@ define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out,
   ret void
 }
 
-; ALL: 'rcp_f32_ieee'
-; SLOWFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
-; FASTFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
+; ALL-LABEL: 'rcp_f32_ieee'
+; THRPTALL: estimated cost of 14 for {{.*}} fdiv float
+; SIZEALL: estimated cost of 12 for {{.*}} fdiv float
 define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fdiv float 1.0, %vec
@@ -198,8 +214,9 @@ define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspac
   ret void
 }
 
-; ALL: 'rcp_f32_ftzdaz'
-; ALL: estimated cost of 3 for {{.*}} fdiv float
+; ALL-LABEL: 'rcp_f32_ftzdaz'
+; THRPTALL: estimated cost of 4 for {{.*}} fdiv float
+; SIZEALL: estimated cost of 2 for {{.*}} fdiv float
 define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr) #1 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fdiv float 1.0, %vec
@@ -207,9 +224,11 @@ define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrsp
   ret void
 }
 
-; ALL: 'rcp_f16_f32_ieee'
-; NOFP16: estimated cost of 10 for {{.*}} fdiv half
-; FP16: estimated cost of 3 for {{.*}} fdiv half
+; ALL-LABEL: 'rcp_f16_f32_ieee'
+; NOFP16: estimated cost of 14 for {{.*}} fdiv half
+; FP16: estimated cost of 4 for {{.*}} fdiv half
+; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half
+; SIZEF16: estimated cost of 2 for {{.*}} fdiv half
 define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half 1.0, %vec
@@ -217,9 +236,9 @@ define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrsp
   ret void
 }
 
-; ALL: 'rcp_f16_f32_ftzdaz'
-; NOFP16: estimated cost of 3 for {{.*}} fdiv half
-; FP16: estimated cost of 3 for {{.*}} fdiv half
+; ALL-LABEL: 'rcp_f16_f32_ftzdaz'
+; THRPTALL: estimated cost of 4 for {{.*}} fdiv half
+; SIZEALL: estimated cost of 2 for {{.*}} fdiv half
 define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr) #1 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half 1.0, %vec
@@ -227,11 +246,13 @@ define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addr
   ret void
 }
 
-; ALL: 'rcp_f64'
-; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
-; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
-; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
-; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
+; ALL-LABEL: 'rcp_f64'
+; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double
+; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double
+; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double
+; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
+; SIZECI: estimated cost of 22 for {{.*}} fdiv double
+; SIZESI: estimated cost of 25 for {{.*}} fdiv double
 define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fdiv double 1.0, %vec
@@ -239,9 +260,9 @@ define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1
   ret void
 }
 
-; ALL: 'rcp_v2f32_ieee'
-; SLOWFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
-; FASTFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
+; ALL-LABEL: 'rcp_v2f32_ieee'
+; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
+; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
 define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
@@ -249,8 +270,9 @@ define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x f
   ret void
 }
 
-; ALL: 'rcp_v2f32_ftzdaz'
-; ALL: estimated cost of 6 for {{.*}} fdiv <2 x float>
+; ALL-LABEL: 'rcp_v2f32_ftzdaz'
+; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float>
+; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float>
 define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #1 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
@@ -258,9 +280,11 @@ define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x
   ret void
 }
 
-; ALL: 'rcp_v2f16_f32_ieee'
-; NOFP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
-; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
+; ALL-LABEL: 'rcp_v2f16_f32_ieee'
+; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
+; FP16: estimated cost of 8 for {{.*}} fdiv <2 x half>
+; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
+; SIZEF16: estimated cost of 4 for {{.*}} fdiv <2 x half>
 define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
@@ -268,9 +292,9 @@ define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2
   ret void
 }
 
-; ALL: 'rcp_v2f16_f32_ftzdaz'
-; NOFP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
-; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
+; ALL-LABEL: 'rcp_v2f16_f32_ftzdaz'
+; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x half>
+; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x half>
 define amdgpu_kernel void @rcp_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #1 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> <half 1.0, half 1.0>, %vec

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index 462163d2f03e..41cbe0fac7f6 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -1,11 +1,12 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FAST32,FASTF16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOW32,SLOWF16,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF32,FASTF16,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF32,SLOWF16,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s
 
 ; ALL-LABEL: 'fma_f32'
-; SLOW32: estimated cost of 3 for {{.*}} call float @llvm.fma.f32
-; FAST32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
+; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32
+; FASTF32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
+; SIZEALL: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
 define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %fma = call float @llvm.fma.f32(float %vec, float %vec, float %vec) #1
@@ -14,8 +15,9 @@ define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)*
 }
 
 ; ALL-LABEL: 'fma_v2f32'
-; SLOW32: estimated cost of 6 for {{.*}} call <2 x float> @llvm.fma.v2f32
-; FAST32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
+; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32
+; FASTF32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
+; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
 define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %vec, <2 x float> %vec, <2 x float> %vec) #1
@@ -24,8 +26,9 @@ define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float>
 }
 
 ; ALL-LABEL: 'fma_v3f32'
-; SLOW32: estimated cost of 9 for {{.*}} call <3 x float> @llvm.fma.v3f32
-; FAST32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
+; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32
+; FASTF32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
+; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
 define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %fma = call <3 x float> @llvm.fma.v3f32(<3 x float> %vec, <3 x float> %vec, <3 x float> %vec) #1
@@ -34,8 +37,9 @@ define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float>
 }
 
 ; ALL-LABEL: 'fma_v5f32'
-; SLOW32: estimated cost of 15 for {{.*}} call <5 x float> @llvm.fma.v5f32
-; FAST32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
+; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32
+; FASTF32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
+; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
 define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %fma = call <5 x float> @llvm.fma.v5f32(<5 x float> %vec, <5 x float> %vec, <5 x float> %vec) #1
@@ -44,8 +48,9 @@ define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float>
 }
 
 ; ALL-LABEL: 'fma_f64'
-; SLOW64: estimated cost of 3 for {{.*}} call double @llvm.fma.f64
-; FAST64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
+; SLOWF64: estimated cost of 4 for {{.*}} call double @llvm.fma.f64
+; FASTF64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
+; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
 define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %fma = call double @llvm.fma.f64(double %vec, double %vec, double %vec) #1
@@ -54,8 +59,9 @@ define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1
 }
 
 ; ALL-LABEL: 'fma_v2f64'
-; SLOW64: estimated cost of 6 for {{.*}} call <2 x double> @llvm.fma.v2f64
-; FAST64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
+; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64
+; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
+; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
 define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %vec, <2 x double> %vec, <2 x double> %vec) #1
@@ -64,8 +70,9 @@ define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x doubl
 }
 
 ; ALL-LABEL: 'fma_v3f64'
-; SLOW64: estimated cost of 9 for {{.*}} call <3 x double> @llvm.fma.v3f64
-; FAST64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
+; SLOWF64: estimated cost of 12 for {{.*}} call <3 x double> @llvm.fma.v3f64
+; FASTF64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
+; SIZEALL: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
 define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %fma = call <3 x double> @llvm.fma.v3f64(<3 x double> %vec, <3 x double> %vec, <3 x double> %vec) #1
@@ -74,8 +81,9 @@ define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x doubl
 }
 
 ; ALL-LABEL: 'fma_f16'
-; SLOW16: estimated cost of 3 for {{.*}} call half @llvm.fma.f16
-; FAST16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
+; SLOWF16: estimated cost of 4 for {{.*}} call half @llvm.fma.f16
+; FASTF16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
+; SIZEALL: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
 define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %fma = call half @llvm.fma.f16(half %vec, half %vec, half %vec) #1
@@ -84,8 +92,10 @@ define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %
 }
 
 ; ALL-LABEL: 'fma_v2f16'
-; SLOW16: estimated cost of 6 for {{.*}} call <2 x half> @llvm.fma.v2f16
-; FAST16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
+; SLOWF16: estimated cost of 8 for {{.*}} call <2 x half> @llvm.fma.v2f16
+; FASTF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
+; SIZEF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
+; SIZENOF16: estimated cost of 4 for {{.*}} call <2 x half> @llvm.fma.v2f16
 define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %vec, <2 x half> %vec, <2 x half> %vec) #1
@@ -94,8 +104,10 @@ define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> a
 }
 
 ; ALL-LABEL: 'fma_v3f16'
-; SLOW16: estimated cost of 12 for {{.*}} call <3 x half> @llvm.fma.v3f16
-; FAST16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
+; SLOWF16: estimated cost of 16 for {{.*}} call <3 x half> @llvm.fma.v3f16
+; FASTF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
+; SIZEF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
+; SIZENOF16: estimated cost of 8 for {{.*}} call <3 x half> @llvm.fma.v3f16
 define amdgpu_kernel void @fma_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
   %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %vec, <3 x half> %vec, <3 x half> %vec) #1

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index ea07089b4492..ba855b2665cc 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s
 
 ; ALL-LABEL: 'fmul_f32'
 ; ALL: estimated cost of 1 for {{.*}} fmul float
@@ -22,9 +22,7 @@ define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float
 }
 
 ; ALL-LABEL: 'fmul_v3f32'
-; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
-; and 3 when it is legal.
-; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
+; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
 define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fmul <3 x float> %vec, %b
@@ -33,9 +31,7 @@ define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float
 }
 
 ; ALL-LABEL: 'fmul_v5f32'
-; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
-; and 5 when it is legal.
-; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
+; ALL: estimated cost of 5 for {{.*}} fmul <5 x float>
 define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %add = fmul <5 x float> %vec, %b
@@ -45,7 +41,8 @@ define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float
 
 ; ALL-LABEL: 'fmul_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
-; SLOWF64: estimated cost of 3 for {{.*}} fmul double
+; SLOWF64: estimated cost of 4 for {{.*}} fmul double
+; SIZEALL: estimated cost of 2 for {{.*}} fmul double
 define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fmul double %vec, %b
@@ -55,7 +52,8 @@ define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(
 
 ; ALL-LABEL: 'fmul_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
-; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double>
+; SLOWF64: estimated cost of 8 for {{.*}} fmul <2 x double>
+; SIZEALL: estimated cost of 4 for {{.*}} fmul <2 x double>
 define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fmul <2 x double> %vec, %b
@@ -65,7 +63,8 @@ define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 
 ; ALL-LABEL: 'fmul_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
-; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double>
+; SLOWF64: estimated cost of 12 for {{.*}} fmul <3 x double>
+; SIZEALL: estimated cost of 6 for {{.*}} fmul <3 x double>
 define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fmul <3 x double> %vec, %b

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index 8bc6ebcd7da0..287bba8f83b1 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
-; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
-; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,FASTF16,ALL %s
+; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,SLOWF16,ALL %s
 
-; ALL: 'fsub_f32'
+; ALL-LABEL: 'fsub_f32'
 ; ALL: estimated cost of 1 for {{.*}} fsub float
 define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
@@ -12,7 +12,7 @@ define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)
   ret void
 }
 
-; ALL: 'fsub_v2f32'
+; ALL-LABEL: 'fsub_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
 define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
@@ -21,10 +21,8 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float
   ret void
 }
 
-; ALL: 'fsub_v3f32'
-; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
-; and 3 when it is legal.
-; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float>
+; ALL-LABEL: 'fsub_v3f32'
+; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
 define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fsub <3 x float> %vec, %b
@@ -32,10 +30,8 @@ define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float
   ret void
 }
 
-; ALL: 'fsub_v5f32'
-; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
-; and 5 when it is legal.
-; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float>
+; ALL-LABEL: 'fsub_v5f32'
+; ALL: estimated cost of 5 for {{.*}} fsub <5 x float>
 define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
   %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
   %add = fsub <5 x float> %vec, %b
@@ -43,9 +39,10 @@ define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float
   ret void
 }
 
-; ALL: 'fsub_f64'
+; ALL-LABEL: 'fsub_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
-; SLOWF64: estimated cost of 3 for {{.*}} fsub double
+; SLOWF64: estimated cost of 4 for {{.*}} fsub double
+; SIZEALL: estimated cost of 2 for {{.*}} fsub double
 define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fsub double %vec, %b
@@ -53,9 +50,10 @@ define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(
   ret void
 }
 
-; ALL: 'fsub_v2f64'
+; ALL-LABEL: 'fsub_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
-; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double>
+; SLOWF64: estimated cost of 8 for {{.*}} fsub <2 x double>
+; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double>
 define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fsub <2 x double> %vec, %b
@@ -63,9 +61,10 @@ define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
   ret void
 }
 
-; ALL: 'fsub_v3f64'
+; ALL-LABEL: 'fsub_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
-; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double>
+; SLOWF64: estimated cost of 12 for {{.*}} fsub <3 x double>
+; SIZEALL: estimated cost of 6 for {{.*}} fsub <3 x double>
 define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fsub <3 x double> %vec, %b
@@ -73,7 +72,7 @@ define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x doub
   ret void
 }
 
-; ALL: 'fsub_f16'
+; ALL-LABEL: 'fsub_f16'
 ; ALL: estimated cost of 1 for {{.*}} fsub half
 define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
@@ -82,7 +81,7 @@ define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)*
   ret void
 }
 
-; ALL: 'fsub_v2f16'
+; ALL-LABEL: 'fsub_v2f16'
 ; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half>
 ; FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half>
 define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
@@ -92,7 +91,7 @@ define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
   ret void
 }
 
-; ALL: 'fsub_v3f16'
+; ALL-LABEL: 'fsub_v3f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half>
 define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
@@ -102,7 +101,7 @@ define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half>
   ret void
 }
 
-; ALL: 'fsub_v4f16'
+; ALL-LABEL: 'fsub_v4f16'
 ; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half>
 ; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half>
 define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
index 21067738af23..5fbd7835351e 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fused_costs.ll
@@ -1,11 +1,11 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED32,FUSED16,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED32,FUSED16,CONTRACT,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,NOCONTRACT,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,NOCONTRACT,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,NOCONTRACT,THRPTALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=FUSED,SZNOCONTRACT,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,SZNOCONTRACT,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,SZNOCONTRACT,SIZEALL,ALL %s
 
 target triple = "amdgcn--"
 
@@ -113,8 +113,10 @@ define <2 x half> @fmul_fsub_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r
 
 ; ALL-LABEL: 'fmul_fadd_f64':
 ; CONTRACT: estimated cost of 0 for instruction:   %mul = fmul double
-; NOCONTRACT: estimated cost of 3 for instruction:   %mul = fmul double
-; ALL: estimated cost of 3 for instruction:   %add = fadd double
+; NOCONTRACT: estimated cost of 4 for instruction:   %mul = fmul double
+; SZNOCONTRACT: estimated cost of 2 for instruction:   %mul = fmul double
+; THRPTALL: estimated cost of 4 for instruction:   %add = fadd double
+; SIZEALL: estimated cost of 2 for instruction:   %add = fadd double
 define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
   %mul = fmul double %r0, %r1
   %add = fadd double %mul, %r2
@@ -123,7 +125,8 @@ define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
 
 ; ALL-LABEL: 'fmul_fadd_contract_f64':
 ; ALL: estimated cost of 0 for instruction:   %mul = fmul contract double
-; ALL: estimated cost of 3 for instruction:   %add = fadd contract double
+; THRPTALL: estimated cost of 4 for instruction:   %add = fadd contract double
+; SIZEALL: estimated cost of 2 for instruction:   %add = fadd contract double
 define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
   %mul = fmul contract double %r0, %r1
   %add = fadd contract double %mul, %r2
@@ -132,8 +135,10 @@ define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
 
 ; ALL-LABEL: 'fmul_fadd_v2f64':
 ; CONTRACT: estimated cost of 0 for instruction:   %mul = fmul <2 x double>
-; NOCONTRACT: estimated cost of 6 for instruction:   %mul = fmul <2 x double>
-; ALL: estimated cost of 6 for instruction:   %add = fadd <2 x double>
+; NOCONTRACT: estimated cost of 8 for instruction:   %mul = fmul <2 x double>
+; SZNOCONTRACT: estimated cost of 4 for instruction:   %mul = fmul <2 x double>
+; THRPTALL: estimated cost of 8 for instruction:   %add = fadd <2 x double>
+; SIZEALL: estimated cost of 4 for instruction:   %add = fadd <2 x double>
 define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
   %mul = fmul <2 x double> %r0, %r1
   %add = fadd <2 x double> %mul, %r2
@@ -142,8 +147,10 @@ define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x do
 
 ; ALL-LABEL: 'fmul_fsub_f64':
 ; CONTRACT: estimated cost of 0 for instruction:   %mul = fmul double
-; NOCONTRACT: estimated cost of 3 for instruction:   %mul = fmul double
-; ALL: estimated cost of 3 for instruction:   %sub = fsub double
+; NOCONTRACT: estimated cost of 4 for instruction:   %mul = fmul double
+; SZNOCONTRACT: estimated cost of 2 for instruction:   %mul = fmul double
+; THRPTALL: estimated cost of 4 for instruction:   %sub = fsub double
+; SIZEALL: estimated cost of 2 for instruction:   %sub = fsub double
 define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {
   %mul = fmul double %r0, %r1
   %sub = fsub double %mul, %r2
@@ -152,8 +159,10 @@ define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {
 
 ; ALL-LABEL: 'fmul_fsub_v2f64':
 ; CONTRACT: estimated cost of 0 for instruction:   %mul = fmul <2 x double>
-; NOCONTRACT: estimated cost of 6 for instruction:   %mul = fmul <2 x double>
-; ALL: estimated cost of 6 for instruction:   %sub = fsub <2 x double>
+; NOCONTRACT: estimated cost of 8 for instruction:   %mul = fmul <2 x double>
+; SZNOCONTRACT: estimated cost of 4 for instruction:   %mul = fmul <2 x double>
+; THRPTALL: estimated cost of 8 for instruction:   %sub = fsub <2 x double>
+; SIZEALL: estimated cost of 4 for instruction:   %sub = fsub <2 x double>
 define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
   %mul = fmul <2 x double> %r0, %r1
   %sub = fsub <2 x double> %mul, %r2

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index fa36d391f9c3..e4ca0685708f 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -1,10 +1,11 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,ALL %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW16,THRPTALL,ALL %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,THRPTALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZESLOW16,SIZEALL,ALL %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=SIZEFAST16,SIZEALL,ALL %s
 
-; ALL: 'mul_i32'
-; ALL: estimated cost of 3 for {{.*}} mul i32
+; ALL-LABEL: 'mul_i32'
+; THRPTALL: estimated cost of 4 for {{.*}} mul i32
+; SIZEALL: estimated cost of 2 for {{.*}} mul i32
 define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %mul = mul i32 %vec, %b
@@ -12,8 +13,9 @@ define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
   ret void
 }
 
-; ALL: 'mul_v2i32'
-; ALL: estimated cost of 6 for {{.*}} mul <2 x i32>
+; ALL-LABEL: 'mul_v2i32'
+; THRPTALL: estimated cost of 8 for {{.*}} mul <2 x i32>
+; SIZEALL: estimated cost of 4 for {{.*}} mul <2 x i32>
 define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %mul = mul <2 x i32> %vec, %b
@@ -21,10 +23,9 @@ define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
   ret void
 }
 
-; ALL: 'mul_v3i32'
-; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
-; and 9 when it is legal.
-; ALL: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
+; ALL-LABEL: 'mul_v3i32'
+; THRPTALL: estimated cost of 12 for {{.*}} mul <3 x i32>
+; SIZEALL: estimated cost of 6 for {{.*}} mul <3 x i32>
 define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
@@ -32,10 +33,9 @@ define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> add
   ret void
 }
 
-; ALL: 'mul_v5i32'
-; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
-; and 15 when it is legal.
-; ALL: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
+; ALL-LABEL: 'mul_v5i32'
+; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
+; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
 define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
   %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
   %mul = mul <5 x i32> %vec, %b
@@ -43,8 +43,9 @@ define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> add
   ret void
 }
 
-; ALL: 'mul_v4i32'
-; ALL: estimated cost of 12 for {{.*}} mul <4 x i32>
+; ALL-LABEL: 'mul_v4i32'
+; THRPTALL: estimated cost of 16 for {{.*}} mul <4 x i32>
+; SIZEALL: estimated cost of 8 for {{.*}} mul <4 x i32>
 define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %mul = mul <4 x i32> %vec, %b
@@ -52,8 +53,9 @@ define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
   ret void
 }
 
-; ALL: 'mul_i64'
-; ALL: estimated cost of 16 for {{.*}} mul i64
+; ALL-LABEL: 'mul_i64'
+; THRPTALL: estimated cost of 20 for {{.*}} mul i64
+; SIZEALL: estimated cost of 12 for {{.*}} mul i64
 define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %mul = mul i64 %vec, %b
@@ -61,8 +63,9 @@ define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
   ret void
 }
 
-; ALL: 'mul_v2i64'
-; ALL: estimated cost of 32 for {{.*}} mul <2 x i64>
+; ALL-LABEL: 'mul_v2i64'
+; THRPTALL: estimated cost of 40 for {{.*}} mul <2 x i64>
+; SIZEALL: estimated cost of 24 for {{.*}} mul <2 x i64>
 define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %mul = mul <2 x i64> %vec, %b
@@ -70,8 +73,9 @@ define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> add
   ret void
 }
 
-; ALL: 'mul_v3i64'
-; ALL: estimated cost of 48 for {{.*}} mul <3 x i64>
+; ALL-LABEL: 'mul_v3i64'
+; THRPTALL: estimated cost of 60 for {{.*}} mul <3 x i64>
+; SIZEALL: estimated cost of 36 for {{.*}} mul <3 x i64>
 define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %mul = mul <3 x i64> %vec, %b
@@ -79,8 +83,9 @@ define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> add
   ret void
 }
 
-; ALL: 'mul_v4i64'
-; ALL: estimated cost of 64 for {{.*}} mul <4 x i64>
+; ALL-LABEL: 'mul_v4i64'
+; THRPTALL: estimated cost of 80 for {{.*}} mul <4 x i64>
+; SIZEALL: estimated cost of 48 for {{.*}} mul <4 x i64>
 define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %mul = mul <4 x i64> %vec, %b
@@ -89,8 +94,9 @@ define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
 }
 
 
-; ALL: 'mul_v8i64'
-; ALL: estimated cost of 256 for {{.*}} mul <8 x i64>
+; ALL-LABEL: 'mul_v8i64'
+; THRPTALL: estimated cost of 320 for {{.*}} mul <8 x i64>
+; SIZEALL: estimated cost of 192 for {{.*}} mul <8 x i64>
 define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b
@@ -98,8 +104,9 @@ define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> add
   ret void
 }
 
-; ALL: 'mul_i16'
-; ALL: estimated cost of 3 for {{.*}} mul i16
+; ALL-LABEL: 'mul_i16'
+; THRPTALL: estimated cost of 4 for {{.*}} mul i16
+; SIZEALL: estimated cost of 2 for {{.*}} mul i16
 define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %mul = mul i16 %vec, %b
@@ -107,9 +114,11 @@ define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %va
   ret void
 }
 
-; ALL: 'mul_v2i16'
-; SLOW16: estimated cost of 6 for {{.*}} mul <2 x i16>
-; FAST16: estimated cost of 3 for {{.*}} mul <2 x i16>
+; ALL-LABEL: 'mul_v2i16'
+; SLOW16: estimated cost of 8 for {{.*}} mul <2 x i16>
+; FAST16: estimated cost of 4 for {{.*}} mul <2 x i16>
+; SIZESLOW16: estimated cost of 4 for {{.*}} mul <2 x i16>
+; SIZEFAST16: estimated cost of 2 for {{.*}} mul <2 x i16>
 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %mul = mul <2 x i16> %vec, %b
@@ -117,9 +126,11 @@ define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
   ret void
 }
 
-; ALL: 'mul_v3i16'
-; SLOW16: estimated cost of 12 for {{.*}} mul <3 x i16>
-; FAST16: estimated cost of 6 for {{.*}} mul <3 x i16>
+; ALL-LABEL: 'mul_v3i16'
+; SLOW16: estimated cost of 16 for {{.*}} mul <3 x i16>
+; FAST16: estimated cost of 8 for {{.*}} mul <3 x i16>
+; SIZESLOW16: estimated cost of 8 for {{.*}} mul <3 x i16>
+; SIZEFAST16: estimated cost of 4 for {{.*}} mul <3 x i16>
 define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 {
   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr
   %mul = mul <3 x i16> %vec, %b

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
index 55f547fe3f76..42936644e590 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,FAST64,FAST16 %s
-; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FAST16 %s
+; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOW16 %s
 
-; ALL: 'shl_i32'
+; ALL-LABEL: 'shl_i32'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
 define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -12,9 +12,10 @@ define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %va
   ret void
 }
 
-; ALL: 'shl_i64'
+; ALL-LABEL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
-; SLOW64: estimated cost of 3 for {{.*}} shl i64
+; SLOW64: estimated cost of 4 for {{.*}} shl i64
+; SIZEALL: estimated cost of 2 for {{.*}} shl i64
 define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = shl i64 %vec, %b
@@ -22,7 +23,7 @@ define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %va
   ret void
 }
 
-; ALL: 'shl_i16'
+; ALL-LABEL: 'shl_i16'
 ; ALL: estimated cost of 1 for {{.*}} shl i16
 define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -31,7 +32,7 @@ define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %va
   ret void
 }
 
-; ALL: 'shl_v2i16'
+; ALL-LABEL: 'shl_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16>
 define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -41,7 +42,7 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
   ret void
 }
 
-; ALL: 'lshr_i32'
+; ALL-LABEL: 'lshr_i32'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
 define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -50,9 +51,10 @@ define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %v
   ret void
 }
 
-; ALL: 'lshr_i64'
+; ALL-LABEL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
-; SLOW64: estimated cost of 3 for {{.*}} lshr i64
+; SLOW64: estimated cost of 4 for {{.*}} lshr i64
+; SIZEALL: estimated cost of 2 for {{.*}} lshr i64
 define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = lshr i64 %vec, %b
@@ -60,7 +62,7 @@ define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %v
   ret void
 }
 
-; ALL: 'lshr_i16'
+; ALL-LABEL: 'lshr_i16'
 ; ALL: estimated cost of 1 for {{.*}} lshr i16
 define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -69,7 +71,7 @@ define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %v
   ret void
 }
 
-; ALL: 'lshr_v2i16'
+; ALL-LABEL: 'lshr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16>
 define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
@@ -79,7 +81,7 @@ define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> ad
   ret void
 }
 
-; ALL: 'ashr_i32'
+; ALL-LABEL: 'ashr_i32'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
 define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
@@ -88,9 +90,10 @@ define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %v
   ret void
 }
 
-; ALL: 'ashr_i64'
+; ALL-LABEL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
-; SLOW64: estimated cost of 3 for {{.*}} ashr i64
+; SLOW64: estimated cost of 4 for {{.*}} ashr i64
+; SIZEALL: estimated cost of 2 for {{.*}} ashr i64
 define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = ashr i64 %vec, %b
@@ -98,7 +101,7 @@ define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %v
   ret void
 }
 
-; ALL: 'ashr_i16'
+; ALL-LABEL: 'ashr_i16'
 ; ALL: estimated cost of 1 for {{.*}} ashr i16
 define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
@@ -107,7 +110,7 @@ define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %v
   ret void
 }
 
-; ALL: 'ashr_v2i16'
+; ALL-LABEL: 'ashr_v2i16'
 ; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16>
 ; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16>
 define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {