[llvm] [LoopVectorize][NFC] Rewrite tests to check output of vplan cost model (PR #113697)

Fri Oct 25 07:41:58 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-systemz

Author: David Sherwood (david-arm)

<details>
<summary>Changes</summary>

Currently it's not possible to improve the cost model for tail-folded
loops because as soon as you add a VPInstruction::computeCost function
that adds the costs of instructions such as VPInstruction::ActiveLaneMask
and VPInstruction::ExplicitVectorLength the assert in LoopVectorizationPlanner::computeBestVF fails for some tests. This is
because the VF chosen by the legacy cost model doesn't match the vplan
cost model. See PR #90191. This assert is currently inhibiting attempts
to improve the cost model.

I would like to remove the assert since we've been using the vplan
cost model for 2 months now and that feels long enough to me. However,
in order to do that we have to fix up a whole bunch of tests that rely
upon the legacy cost model output. I've tried my best to update
these tests to use vplan output instead.

There are still a whole bunch of vectoriser tests in

Analysis/CostModel/X86

that depend upon the legacy cost model, and also I feel they shouldn't
really live there either. These can be fixed up in a separate patch!

---

Patch is 76.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113697.diff


25 Files Affected:

- (modified) llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll (+13-6) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll (+7-7) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll (+4-4) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll (-2) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll (+26-21) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll (+13-9) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll (+6-6) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll (+14-6) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll (+16-18) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll (+181-128) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll (+4-4) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/force-vect-msg.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll (+18-13) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/fneg-cost.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll (+14-14) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll (+22-15) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll (+5-5) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll (+4-4) 


``````````diff

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index dd3d4215a3f63a..e6d93ea192e56b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
 ; %var4 a lower scalarization overhead.
 ;
 ; COST-LABEL:  predicated_udiv_scalarized_operand
-; COST:        LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
+; COST:        Cost of 5 for VF 2: profitable to scalarize   %var4 = udiv i64 %var2, %var3
 ;
 ;
 define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 28830f9bcd11e1..aa78113ebaa48c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -11,9 +11,14 @@
 ; CM: LV: Found uniform instruction:   %a = extractvalue { i64, i64 } %sv, 0
 ; CM: LV: Found uniform instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
+; Ensure the extractvalue + add instructions are hoisted out
+; CM: vector.ph:
+; CM:  CLONE ir<%a> = extractvalue ir<%sv>
+; CM:  CLONE ir<%b> = extractvalue ir<%sv>
+; CM:  WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM:  Successor(s): vector loop
+
 ; CM: LV: Scalar loop costs: 5.
-; CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
 ; Check that the extractvalue operands are actually free in vector code.
 
@@ -58,12 +63,14 @@ exit:
 ; Similar to the test case above, but checks getVectorCallCost as well.
 declare float @powf(float, float) readnone nounwind
 
-; CM: LV: Found uniform instruction:   %a = extractvalue { float, float } %sv, 0
-; CM: LV: Found uniform instruction:   %b = extractvalue { float, float } %sv, 1
+; Ensure the extractvalue + add instructions are hoisted out
+; CM: vector.ph:
+; CM:  CLONE ir<%a> = extractvalue ir<%sv>
+; CM:  CLONE ir<%b> = extractvalue ir<%sv>
+; CM:  WIDEN ir<%add> = add ir<%a>, ir<%b>
+; CM:  Successor(s): vector loop
 
 ; CM: LV: Scalar loop costs: 14.
-; CM: LV: Found an estimated cost of 0 for VF 2 For instruction:   %a = extractvalue { float, float } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction:   %b = extractvalue { float, float } %sv, 1
 
 ; FORCED-LABEL: define void @test_getVectorCallCost
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index c9990487665b98..064eead452852d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-COST-LABEL: sadd
 ; CHECK-COST: Found an estimated cost of 6 for VF 1 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction:   %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Cost of 4 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
 
 define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @saddsat(
@@ -95,10 +95,10 @@ while.end:                                        ; preds = %while.body, %entry
 
 ; CHECK-COST-LABEL: umin
 ; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction:   %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Cost of 1 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
+; CHECK-COST: Cost of 1 for VF 16: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
 
 define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
 ; CHECK-LABEL: @umin(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 6b4cfa091c45e7..93bc131ee5c5a4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -5,8 +5,8 @@
 target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
-; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction:   store i32 2, ptr %arrayidx1, align 4
-; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction:   store i32 2, ptr %arrayidx1, align 4
+; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
+; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
 ; CHECK-COST: Selecting VF: 1.
 
 ; We should decide this loop is not worth vectorising using fixed width vectors
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
index 5b90456a4f458b..05e0b7ca7520f2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
@@ -10,10 +10,10 @@ target triple = "aarch64"
 ; due to invalid cost decisions. The loop below has a low maximum trip count,
 ; so will be masked.
 
-; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %0 = load
-; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %0 = load
+; COST: Cost of 3000000 for VF 2: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 4: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 8: REPLICATE ir<%0> = load
+; COST: Cost of 3000000 for VF 16: REPLICATE ir<%0> = load
 ; COST: LV: Selecting VF: 1.
 
 define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
index 785241d342ddc5..089d279d152455 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -6,7 +6,6 @@ target triple = "aarch64--linux-gnu"
 
 ; CHECK-LABEL: all_scalar
 ; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
 ; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
 ;
 define void @all_scalar(ptr %a, i64 %n) {
@@ -27,7 +26,6 @@ for.end:
 
 ; CHECK-LABEL: PR33193
 ; CHECK:       LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
-; CHECK:       LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
 ; CHECK:       LV: Not considering vector loop of width 8 because it will not generate any vector instructions
 %struct.a = type { i32, i8 }
 define void @PR33193(ptr %a, i64 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
index 92b043a9c29d50..fb5ff66989a679 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-fp-ext-trunc-illegal-type.ll
@@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
 ;; registers required for a <vscale x 4 x fp128> when trying to maximize
 ;; vector bandwidth with SVE.
 
-; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128
+; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext  ir<%load.in> to fp128
 
 define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
 ; CHECK-LABEL: define void @load_ext_trunc_store(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
index f28f77bf1b1558..36c35d5a154d94 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -1,49 +1,55 @@
 ; REQUIRES: asserts
 ; RUN: opt -mtriple=aarch64 -mattr=+sve \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
 
 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
 
-; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
+; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \
 ; RUN:     -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
+; RUN:     | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16
+
+; GENERIC: Cost for VF vscale x 2: 11
+; GENERIC: Cost for VF vscale x 4: 11
+; GENERIC: LV: Selecting VF: vscale x 16
 
-; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
-; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+; NEOVERSE-V1: Cost for VF vscale x 2: 11
+; NEOVERSE-V1: Cost for VF vscale x 4: 11
+; NEOVERSE-V1: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
-; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
+; NEOVERSE-N2: Cost for VF vscale x 2: 11
+; NEOVERSE-N2: Cost for VF vscale x 4: 11
+; NEOVERSE-N2: LV: Selecting VF: vscale x 16
 
-; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
-; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
+; NEOVERSE-V2: Cost for VF vscale x 2: 11
+; NEOVERSE-V2: Cost for VF vscale x 4: 11
+; NEOVERSE-V2: LV: Selecting VF: 16
 
-; VF-4: <4 x i32>
-; VF-VSCALE4: <16 x i32>
+; VF-16: <16 x i8>
+; VF-VSCALE16: <vscale x 16 x i8>
 define void @test0(ptr %a, ptr %b, ptr %c) #0 {
 entry:
   br label %loop
 
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
-  %arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
-  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv
+  %0 = load i8, ptr %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
   %1 = load i8, ptr %arrayidx2, align 4
-  %zext = zext i8 %1 to i32
-  %add = add nsw i32 %zext, %0
-  %arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
-  store i32 %add, ptr %arrayidx5, align 4
+  %add = add nsw i8 %0, %1
+  %arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv
+  store i8 %add, ptr %arrayidx5, align 4
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %exit, label %loop
@@ -51,4 +57,3 @@ loop:
 exit:
   ret void
 }
-
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
index 2bcc93127da1e0..6165a73e77f238 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
@@ -6,13 +6,15 @@ target triple = "arm64-apple-ios5.0.0"
 
 define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
 ; CHECK: LV: Checking a loop in 'selects_1'
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
 
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond = select i1 %cmp1, i32 10, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond6 = select i1 %cmp2, i32 30, i32 %and
-; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
+; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
+
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
+; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>
+
 ; CHECK: LV: Selecting VF: 4
 
 entry:
@@ -48,9 +50,11 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 
 define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
 ; CHECK: LV: Checking a loop in 'multi_user_cmp'
-; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction:   %cmp1 = fcmp olt float %load1, 0.000000e+00
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
-; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
+; CHECK: Cost of 1 for VF 16:
+; CHECK:  any-of reduction   %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
+; CHECK: Cost of 1 for VF 16:
+; CHECK:  any-of reduction   %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
+; CHECK: Cost of 4 for VF 16: WIDEN ir<%cmp1> = fcmp olt ir<%load1>, ir<0.000000e+00>
 ; CHECK: LV: Selecting VF: 16.
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 961fa59cadd360..b418905e514af0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -4,9 +4,9 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %addi7 = add i7 %indvars.iv1294, 0
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN ir<%addi7> = add ir<%indvars.iv1294>, ir<0>
 
 define void @induction_i7(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i7(
@@ -71,9 +71,9 @@ for.end:                                          ; preds = %for.body
 }
 
 
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %zexti3 = zext i3 %indvars.iv1294 to i64
-; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction:   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
+; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction   %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
+; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext  ir<%indvars.iv1294> to i64
 
 define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-LABEL: define void @induction_i3_zext(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
index 83bd07a3ec02cf..b84e8de678140f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -7,10 +7,15 @@
 
 target triple="aarch64-unknown-linux-gnu"
 
-; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction:   %add = fadd float %0, %sum.07
-; CHECK: Found an estimated cost of 8 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 4 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-LABEL: LV: Checking a loop in 'fadd_strict32'
+; CHECK: Cost of 4 for VF vscale x 2:
+; CHECK:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK: Cost of 8 for VF vscale x 4:
+; CHECK:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd float %0, %sum.07
+; CHECK-CPU-NEOVERSE-N2: Cost of 4 for VF vscale x 4:
+; CHECK-CPU-NEOVERSE-N2:  in-loop reduction   %add = fadd float %0, %sum.07
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
@@ -31,8 +36,11 @@ for.end:
 }
 
 
-; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction:   %add = fadd double ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/113697