[llvm] 1ccc499 - [AArch64] Add getCFInstrCost, treat branches as free for throughput.

Tue Jun 30 12:34:48 PDT 2020

Author: Florian Hahn
Date: 2020-06-30T20:34:04+01:00
New Revision: 1ccc49924aeb7bb5fbd2eb956243d16db3a02b08

URL: https://github.com/llvm/llvm-project/commit/1ccc49924aeb7bb5fbd2eb956243d16db3a02b08
DIFF: https://github.com/llvm/llvm-project/commit/1ccc49924aeb7bb5fbd2eb956243d16db3a02b08.diff

LOG: [AArch64] Add getCFInstrCost, treat branches as free for throughput.

D79164/2596da31740f changed getCFInstrCost to return 1 per default.
AArch64 did not have its own implementation, hence the throughput cost
of CFI instructions is overestimated. On most cores, most branches should
be predicated and essentially free throughput wise.

This restores a 9% performance regression on a SPEC2006 benchmark on
AArch64 with -O3 LTO & PGO.

This patch effectively restores pre 2596da31740f behavior for AArch64
and undoes the AArch64 test changes of the patch.

Reviewers: samparker, dmgreen, anemet

Reviewed By: samparker

Differential Revision: https://reviews.llvm.org/D82755

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
    llvm/test/Analysis/CostModel/AArch64/aggregates.ll
    llvm/test/Analysis/CostModel/AArch64/cast.ll
    llvm/test/Analysis/CostModel/AArch64/cmp.ll
    llvm/test/Analysis/CostModel/AArch64/select.ll
    llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
    llvm/test/Analysis/CostModel/AArch64/store.ll
    llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
    llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
    llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0fa2748ebe7c..e637284e226e 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -469,6 +469,14 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
   return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
 }
 
+unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
+                                        TTI::TargetCostKind CostKind) {
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return Opcode == Instruction::PHI ? 0 : 1;
+  // Branches are assumed to be predicted.
+  return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
+}
+
 int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                        unsigned Index) {
   assert(Val->isVectorTy() && "This must be a vector type");

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 55d4c01ee1e3..27afb2e5a7d6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -120,6 +120,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                                unsigned Index);
 
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
   int getArithmeticInstrCost(

diff  --git a/llvm/test/Analysis/CostModel/AArch64/aggregates.ll b/llvm/test/Analysis/CostModel/AArch64/aggregates.ll
index 6ce57f966ed1..35d232b3b69a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/aggregates.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/aggregates.ll
@@ -4,63 +4,119 @@
 ; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -cost-model -cost-kind=code-size -analyze | FileCheck %s --check-prefixes=ALL,CODESIZE
 
 define i32 @extract_first_i32({i32, i32} %agg) {
-; ALL-LABEL: 'extract_first_i32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i32 } %agg, 0
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+; THROUGHPUT-LABEL: 'extract_first_i32'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i32 } %agg, 0
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+;
+; LATENCY-LABEL: 'extract_first_i32'
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i32 } %agg, 0
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
+; CODESIZE-LABEL: 'extract_first_i32'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i32 } %agg, 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
 ;
   %r = extractvalue {i32, i32} %agg, 0
   ret i32 %r
 }
 
 define i32 @extract_second_i32({i32, i32} %agg) {
-; ALL-LABEL: 'extract_second_i32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i32 } %agg, 1
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+; THROUGHPUT-LABEL: 'extract_second_i32'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i32 } %agg, 1
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+;
+; LATENCY-LABEL: 'extract_second_i32'
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i32 } %agg, 1
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
+; CODESIZE-LABEL: 'extract_second_i32'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i32 } %agg, 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
 ;
   %r = extractvalue {i32, i32} %agg, 1
   ret i32 %r
 }
 
 define i32 @extract_i32({i32, i1} %agg) {
-; ALL-LABEL: 'extract_i32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i1 } %agg, 0
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+; THROUGHPUT-LABEL: 'extract_i32'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i1 } %agg, 0
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r
+;
+; LATENCY-LABEL: 'extract_i32'
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i1 } %agg, 0
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
+; CODESIZE-LABEL: 'extract_i32'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i1 } %agg, 0
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
 ;
   %r = extractvalue {i32, i1} %agg, 0
   ret i32 %r
 }
 
 define i1 @extract_i1({i32, i1} %agg) {
-; ALL-LABEL: 'extract_i1'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i1 } %agg, 1
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i1 %r
+; THROUGHPUT-LABEL: 'extract_i1'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i1 } %agg, 1
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i1 %r
+;
+; LATENCY-LABEL: 'extract_i1'
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i1 } %agg, 1
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i1 %r
+;
+; CODESIZE-LABEL: 'extract_i1'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, i1 } %agg, 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i1 %r
 ;
   %r = extractvalue {i32, i1} %agg, 1
   ret i1 %r
 }
 
 define float @extract_float({i32, float} %agg) {
-; ALL-LABEL: 'extract_float'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, float } %agg, 1
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %r
+; THROUGHPUT-LABEL: 'extract_float'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, float } %agg, 1
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %r
+;
+; LATENCY-LABEL: 'extract_float'
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, float } %agg, 1
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %r
+;
+; CODESIZE-LABEL: 'extract_float'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, float } %agg, 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret float %r
 ;
   %r = extractvalue {i32, float} %agg, 1
   ret float %r
 }
 
 define [42 x i42] @extract_array({i32, [42 x i42]} %agg) {
-; ALL-LABEL: 'extract_array'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, [42 x i42] } %agg, 1
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret [42 x i42] %r
+; THROUGHPUT-LABEL: 'extract_array'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, [42 x i42] } %agg, 1
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret [42 x i42] %r
+;
+; LATENCY-LABEL: 'extract_array'
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, [42 x i42] } %agg, 1
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret [42 x i42] %r
+;
+; CODESIZE-LABEL: 'extract_array'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, [42 x i42] } %agg, 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret [42 x i42] %r
 ;
   %r = extractvalue {i32, [42 x i42]} %agg, 1
   ret [42 x i42] %r
 }
 
 define <42 x i42> @extract_vector({i32, <42 x i42>} %agg) {
-; ALL-LABEL: 'extract_vector'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, <42 x i42> } %agg, 1
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <42 x i42> %r
+; THROUGHPUT-LABEL: 'extract_vector'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, <42 x i42> } %agg, 1
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <42 x i42> %r
+;
+; LATENCY-LABEL: 'extract_vector'
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, <42 x i42> } %agg, 1
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <42 x i42> %r
+;
+; CODESIZE-LABEL: 'extract_vector'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, <42 x i42> } %agg, 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <42 x i42> %r
 ;
   %r = extractvalue {i32, <42 x i42>} %agg, 1
   ret <42 x i42> %r
@@ -69,9 +125,17 @@ define <42 x i42> @extract_vector({i32, <42 x i42>} %agg) {
 %T1 = type { i32, float, <4 x i1> }
 
 define %T1 @extract_struct({i32, %T1} %agg) {
-; ALL-LABEL: 'extract_struct'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, %T1 } %agg, 1
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret %T1 %r
+; THROUGHPUT-LABEL: 'extract_struct'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, %T1 } %agg, 1
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret %T1 %r
+;
+; LATENCY-LABEL: 'extract_struct'
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, %T1 } %agg, 1
+; LATENCY-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret %T1 %r
+;
+; CODESIZE-LABEL: 'extract_struct'
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = extractvalue { i32, %T1 } %agg, 1
+; CODESIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret %T1 %r
 ;
   %r = extractvalue {i32, %T1} %agg, 1
   ret %T1 %r

diff  --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index 24e514b87096..2255f84f9ff4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -267,7 +267,7 @@ define i32 @casts_no_users() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %r0 = sext i1 undef to i8
   %r1 = zext i1 undef to i8
@@ -609,7 +609,7 @@ define i32 @casts_with_users(i8 %a, i16 %b, i32 %c, i64 %d, i1 %e) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i16 %r23, i16* undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %r24, i32* undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i64 %r25, i64* undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r12
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r12
 ;
   %r0 = sext i8 %a to i16
   %r1 = sext i8 %a to i32
@@ -683,7 +683,7 @@ define i32 @bitcasts() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %a = bitcast i32 undef to i32
   %b = bitcast float undef to float
@@ -731,7 +731,7 @@ define i32 @load_extends() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %loadi8 = load i8, i8* undef
   %loadi16 = load i16, i16* undef
@@ -786,7 +786,7 @@ define i32 @store_truncs() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i16 %r4, i16* undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r5 = trunc i16 undef to i8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i8 %r5, i8* undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %r0 = trunc i64 undef to i8
   store i8 %r0, i8* undef

diff  --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index 7e8fc30141d6..c8512bb2664c 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -17,7 +17,7 @@ define i32 @cmps() {
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %a10 = fcmp olt <8 x half> undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a11 = fcmp oge <4 x float> undef, undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a12 = fcmp oge <2 x double> undef, undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; CHECK-SIZE-LABEL: 'cmps'
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %a0 = icmp slt i8 undef, undef

diff  --git a/llvm/test/Analysis/CostModel/AArch64/select.ll b/llvm/test/Analysis/CostModel/AArch64/select.ll
index 6ec4cff5cf84..25af9af1c6e9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/select.ll
@@ -18,7 +18,7 @@ define void @select() {
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
 ; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
-; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-SIZE-LABEL: 'select'
 ; CHECK-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v1 = select i1 undef, i8 undef, i8 undef

diff  --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
index 61f28cbea5af..355ed520575f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
@@ -14,7 +14,7 @@ define void @broadcast() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v7 = shufflevector <2 x i8> undef, <2 x i8>undef, <2 x i32> zeroinitializer
   %v8 = shufflevector <4 x i8> undef, <4 x i8>undef, <4 x i32> zeroinitializer

diff  --git a/llvm/test/Analysis/CostModel/AArch64/store.ll b/llvm/test/Analysis/CostModel/AArch64/store.ll
index 8e23268bada7..63741756fa0f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/store.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/store.ll
@@ -27,7 +27,7 @@ define void @getMemoryOpCost() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, <4 x i8>* undef, align 4
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %1 = load <2 x i8>, <2 x i8>* undef, align 2
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %2 = load <4 x i8>, <4 x i8>* undef, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'getMemoryOpCost'
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i64> undef, <4 x i64>* undef, align 4
@@ -69,7 +69,7 @@ define void @getMemoryOpCost() {
 ; SLOW_MISALIGNED_128_STORE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, <4 x i8>* undef, align 4
 ; SLOW_MISALIGNED_128_STORE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %1 = load <2 x i8>, <2 x i8>* undef, align 2
 ; SLOW_MISALIGNED_128_STORE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %2 = load <4 x i8>, <4 x i8>* undef, align 4
-; SLOW_MISALIGNED_128_STORE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SLOW_MISALIGNED_128_STORE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   store <4 x i64> undef, <4 x i64> * undef
   store <8 x i32> undef, <8 x i32> * undef

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 3a7bbbf795aa..eb12803a3447 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -7,12 +7,12 @@ target triple = "aarch64--linux-gnu"
 
 ; This test checks that we correctly compute the scalarized operands for a
 ; user-specified vectorization factor when interleaving is disabled. We use the
-; "optsize" attribute to disable all interleaving calculations.  A cost of 5
+; "optsize" attribute to disable all interleaving calculations.  A cost of 4
 ; for %tmp4 indicates that we would scalarize it's operand (%tmp3), giving
 ; %tmp4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+; COST:        LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
 ;
 ; CHECK-LABEL: @predicated_udiv_scalarized_operand(
 ; CHECK:       vector.body:

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 877d8a844305..80d2e282176a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -8,7 +8,7 @@
 ; Check scalar cost for extractvalue. The constant and loop invariant operands are free,
 ; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
 
-; CM: LV: Scalar loop costs: 9.
+; CM: LV: Scalar loop costs: 7.
 ; CM: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
 ; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
@@ -57,7 +57,7 @@ exit:
 ; Similar to the test case above, but checks getVectorCallCost as well.
 declare float @pow(float, float) readnone nounwind
 
-; CM: LV: Scalar loop costs: 18.
+; CM: LV: Scalar loop costs: 16.
 ; CM: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { float, float } %sv, 0
 ; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { float, float } %sv, 1
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index 57a3dfdbc848..b0ebb4edf2ad 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -19,7 +19,7 @@ target triple = "aarch64--linux-gnu"
 ;   (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
 ;
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
-; CHECK: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) {
 entry:
@@ -101,7 +101,7 @@ for.end:
 ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
 ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
 ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
-; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
 ;
 define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry:
@@ -198,8 +198,8 @@ for.end:
 ; CHECK:     Scalarizing: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
 ; CHECK:     Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
-; CHECK:     Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
-; CHECK:     Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK:     Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
 ; CHECK:     Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
 ;