[llvm] 1fbb328 - [ARM] MVE ICmp costing tests. NFC

Thu Feb 18 02:50:55 PST 2021

Author: David Green
Date: 2021-02-18T10:50:34Z
New Revision: 1fbb3287fcda77d69a405dea17d7d11e1d3ac152

URL: https://github.com/llvm/llvm-project/commit/1fbb3287fcda77d69a405dea17d7d11e1d3ac152
DIFF: https://github.com/llvm/llvm-project/commit/1fbb3287fcda77d69a405dea17d7d11e1d3ac152.diff

LOG: [ARM] MVE ICmp costing tests. NFC

Added: 
    llvm/test/Analysis/CostModel/ARM/mve-cmp.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll

Modified: 
    llvm/test/CodeGen/ARM/vselect_imax.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll b/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll
new file mode 100644
index 000000000000..0ca8bc0470b4

--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @icmp() {
+; CHECK-LABEL: 'icmp'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = icmp slt <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = icmp slt <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = icmp slt <8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = icmp slt <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8 = icmp slt <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = icmp slt <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = icmp slt <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = icmp slt <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = icmp slt <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = icmp slt <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = icmp slt <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = icmp slt <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32 = icmp slt <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = icmp slt <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4i64 = icmp slt <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8i64 = icmp slt <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2i8 = icmp slt <2 x i8> undef, undef
+  %v4i8 = icmp slt <4 x i8> undef, undef
+  %v8i8 = icmp slt <8 x i8> undef, undef
+  %v16i8 = icmp slt <16 x i8> undef, undef
+  %v32i8 = icmp slt <32 x i8> undef, undef
+
+  %v2i16 = icmp slt <2 x i16> undef, undef
+  %v4i16 = icmp slt <4 x i16> undef, undef
+  %v8i16 = icmp slt <8 x i16> undef, undef
+  %v16i16 = icmp slt <16 x i16> undef, undef
+
+  %v2i32 = icmp slt <2 x i32> undef, undef
+  %v4i32 = icmp slt <4 x i32> undef, undef
+  %v8i32 = icmp slt <8 x i32> undef, undef
+  %v16i32 = icmp slt <16 x i32> undef, undef
+
+  %v2i64 = icmp slt <2 x i64> undef, undef
+  %v4i64 = icmp slt <4 x i64> undef, undef
+  %v8i64 = icmp slt <8 x i64> undef, undef
+
+  ret void
+}
+
+define void @fcmp() {
+; CHECK-LABEL: 'fcmp'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32 = fcmp olt <16 x float> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = fcmp olt <2 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = fcmp olt <4 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = fcmp olt <8 x double> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v2f16 = fcmp olt <2 x half> undef, undef
+  %v4f16 = fcmp olt <4 x half> undef, undef
+  %v8f16 = fcmp olt <8 x half> undef, undef
+  %v16f16 = fcmp olt <16 x half> undef, undef
+
+  %v2f32 = fcmp olt <2 x float> undef, undef
+  %v4f32 = fcmp olt <4 x float> undef, undef
+  %v8f32 = fcmp olt <8 x float> undef, undef
+  %v16f32 = fcmp olt <16 x float> undef, undef
+
+  %v2f64 = fcmp olt <2 x double> undef, undef
+  %v4f64 = fcmp olt <4 x double> undef, undef
+  %v8f64 = fcmp olt <8 x double> undef, undef
+
+  ret void
+}

diff  --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll
index 03e212c75567..31f19e03bf66 100644
--- a/llvm/test/CodeGen/ARM/vselect_imax.ll
+++ b/llvm/test/CodeGen/ARM/vselect_imax.ll
@@ -21,6 +21,7 @@ define void @func_blend10(%T0_10* %loadaddr, %T0_10* %loadaddr2,
 ; CHECK: vmin.s16
 ; CHECK: vmin.s16
 ; COST: func_blend10
+; COST: cost of 2 {{.*}} icmp
 ; COST: cost of 2 {{.*}} select
   %r = select %T1_10 %c, %T0_10 %v0, %T0_10 %v1
   store %T0_10 %r, %T0_10* %storeaddr
@@ -37,6 +38,7 @@ define void @func_blend14(%T0_14* %loadaddr, %T0_14* %loadaddr2,
 ; CHECK: vmin.s32
 ; CHECK: vmin.s32
 ; COST: func_blend14
+; COST: cost of 2 {{.*}} icmp
 ; COST: cost of 2 {{.*}} select
   %r = select %T1_14 %c, %T0_14 %v0, %T0_14 %v1
   store %T0_14 %r, %T0_14* %storeaddr
@@ -53,6 +55,7 @@ define void @func_blend15(%T0_15* %loadaddr, %T0_15* %loadaddr2,
   %v1 = load %T0_15, %T0_15* %loadaddr2
   %c = icmp slt %T0_15 %v0, %v1
 ; COST: func_blend15
+; COST: cost of 4 {{.*}} icmp
 ; COST: cost of 4 {{.*}} select
   %r = select %T1_15 %c, %T0_15 %v0, %T0_15 %v1
   store %T0_15 %r, %T0_15* %storeaddr
@@ -127,6 +130,7 @@ define void @func_blend18(%T0_18* %loadaddr, %T0_18* %loadaddr2,
   %v1 = load %T0_18, %T0_18* %loadaddr2
   %c = icmp slt %T0_18 %v0, %v1
 ; COST: func_blend18
+; COST: cost of 2 {{.*}} icmp
 ; COST: cost of 19 {{.*}} select
   %r = select %T1_18 %c, %T0_18 %v0, %T0_18 %v1
   store %T0_18 %r, %T0_18* %storeaddr
@@ -256,6 +260,7 @@ define void @func_blend19(%T0_19* %loadaddr, %T0_19* %loadaddr2,
   %v1 = load %T0_19, %T0_19* %loadaddr2
   %c = icmp slt %T0_19 %v0, %v1
 ; COST: func_blend19
+; COST: cost of 4 {{.*}} icmp
 ; COST: cost of 50 {{.*}} select
   %r = select %T1_19 %c, %T0_19 %v0, %T0_19 %v1
   store %T0_19 %r, %T0_19* %storeaddr
@@ -511,6 +516,7 @@ define void @func_blend20(%T0_20* %loadaddr, %T0_20* %loadaddr2,
   %v1 = load %T0_20, %T0_20* %loadaddr2
   %c = icmp slt %T0_20 %v0, %v1
 ; COST: func_blend20
+; COST: cost of 8 {{.*}} icmp
 ; COST: cost of 100 {{.*}} select
   %r = select %T1_20 %c, %T0_20 %v0, %T0_20 %v1
   store %T0_20 %r, %T0_20* %storeaddr

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
new file mode 100644
index 000000000000..45730abcea33
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -0,0 +1,240 @@
+; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx = getelementptr inbounds i16, i16* %s, i32 %i.016
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load i16, i16* %arrayidx, align 2
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv = sext i16 %1 to i32
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %conv6 = add i16 %1, %0
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %arrayidx7 = getelementptr inbounds i16, i16* %d, i32 %i.016
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %conv6, i16* %arrayidx7, align 2
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br label %for.inc
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+; CHECK: LV: Scalar loop costs: 5.
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx = getelementptr inbounds i16, i16* %s, i32 %i.016
+; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction:   %1 = load i16, i16* %arrayidx, align 2
+; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv = sext i16 %1 to i32
+; CHECK: LV: Found an estimated cost of 12 for VF 2 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
+; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction:   %conv6 = add i16 %1, %0
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx7 = getelementptr inbounds i16, i16* %d, i32 %i.016
+; CHECK: LV: Found an estimated cost of 16 for VF 2 For instruction:   store i16 %conv6, i16* %arrayidx7, align 2
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+; CHECK: LV: Vector loop of width 2 costs: 29.
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx = getelementptr inbounds i16, i16* %s, i32 %i.016
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load i16, i16* %arrayidx, align 2
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv = sext i16 %1 to i32
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %conv6 = add i16 %1, %0
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %arrayidx7 = getelementptr inbounds i16, i16* %d, i32 %i.016
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i16 %conv6, i16* %arrayidx7, align 2
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br label %for.inc
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+; CHECK: LV: Vector loop of width 4 costs: 2.
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %arrayidx = getelementptr inbounds i16, i16* %s, i32 %i.016
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load i16, i16* %arrayidx, align 2
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv = sext i16 %1 to i32
+; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %cmp2 = icmp sgt i32 %conv, %conv1
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %cmp2, label %if.then, label %for.inc
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv6 = add i16 %1, %0
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %arrayidx7 = getelementptr inbounds i16, i16* %d, i32 %i.016
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i16 %conv6, i16* %arrayidx7, align 2
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br label %for.inc
+; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %inc = add nuw nsw i32 %i.016, 1
+; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+; CHECK: LV: Vector loop of width 8 costs: 1.
+; CHECK: LV: Selecting VF: 8.
+define void @expensive_icmp(i16* noalias nocapture %d, i16* nocapture readonly %s, i32 %n, i16 zeroext %m) #0 {
+entry:
+  %cmp15 = icmp sgt i32 %n, 0
+  br i1 %cmp15, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv1 = zext i16 %m to i32
+  %0 = trunc i32 %n to i16
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc
+  %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %arrayidx = getelementptr inbounds i16, i16* %s, i32 %i.016
+  %1 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %1 to i32
+  %cmp2 = icmp sgt i32 %conv, %conv1
+  br i1 %cmp2, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %conv6 = add i16 %1, %0
+  %arrayidx7 = getelementptr inbounds i16, i16* %d, i32 %i.016
+  store i16 %conv6, i16* %arrayidx7, align 2
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i.016, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %pSrcB.addr.09 = phi i8* [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %0 = load i8, i8* %pSrcA.addr.011, align 1
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv1 = sext i8 %0 to i32
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.09, i32 1
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %1 = load i8, i8* %pSrcB.addr.09, align 1
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv3 = sext i8 %1 to i32
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %mul = mul nsw i32 %conv3, %conv1
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %shr = ashr i32 %mul, 7
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %2 = icmp slt i32 %shr, 127
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, i8* %pDst.addr.010, i32 1
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %conv4, i8* %pDst.addr.010, align 1
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %dec = add i32 %blkCnt.012, -1
+; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
+; CHECK: LV: Scalar loop costs: 9.
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %pSrcB.addr.09 = phi i8* [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
+; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction:   %0 = load i8, i8* %pSrcA.addr.011, align 1
+; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv1 = sext i8 %0 to i32
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.09, i32 1
+; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction:   %1 = load i8, i8* %pSrcB.addr.09, align 1
+; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction:   %conv3 = sext i8 %1 to i32
+; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction:   %mul = mul nsw i32 %conv3, %conv1
+; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction:   %shr = ashr i32 %mul, 7
+; CHECK: LV: Found an estimated cost of 12 for VF 2 For instruction:   %2 = icmp slt i32 %shr, 127
+; CHECK: LV: Found an estimated cost of 12 for VF 2 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, i8* %pDst.addr.010, i32 1
+; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction:   store i8 %conv4, i8* %pDst.addr.010, align 1
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %dec = add i32 %blkCnt.012, -1
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction:   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
+; CHECK: LV: Vector loop of width 2 costs: 44.
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %pSrcB.addr.09 = phi i8* [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %0 = load i8, i8* %pSrcA.addr.011, align 1
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv1 = sext i8 %0 to i32
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.09, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %1 = load i8, i8* %pSrcB.addr.09, align 1
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv3 = sext i8 %1 to i32
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %mul = mul nsw i32 %conv3, %conv1
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %shr = ashr i32 %mul, 7
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %2 = icmp slt i32 %shr, 127
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, i8* %pDst.addr.010, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i8 %conv4, i8* %pDst.addr.010, align 1
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %dec = add i32 %blkCnt.012, -1
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction:   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
+; CHECK: LV: Vector loop of width 4 costs: 4.
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %pSrcB.addr.09 = phi i8* [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %0 = load i8, i8* %pSrcA.addr.011, align 1
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv1 = sext i8 %0 to i32
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.09, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %1 = load i8, i8* %pSrcB.addr.09, align 1
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv3 = sext i8 %1 to i32
+; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %mul = mul nsw i32 %conv3, %conv1
+; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %shr = ashr i32 %mul, 7
+; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %2 = icmp slt i32 %shr, 127
+; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, i8* %pDst.addr.010, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i8 %conv4, i8* %pDst.addr.010, align 1
+; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %dec = add i32 %blkCnt.012, -1
+; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction:   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
+; CHECK: LV: Vector loop of width 8 costs: 3.
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %pSrcB.addr.09 = phi i8* [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %0 = load i8, i8* %pSrcA.addr.011, align 1
+; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv1 = sext i8 %0 to i32
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.09, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %1 = load i8, i8* %pSrcB.addr.09, align 1
+; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv3 = sext i8 %1 to i32
+; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %mul = mul nsw i32 %conv3, %conv1
+; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %shr = ashr i32 %mul, 7
+; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %2 = icmp slt i32 %shr, 127
+; CHECK: LV: Found an estimated cost of 8 for VF 16 For instruction:   %spec.select.i = select i1 %2, i32 %shr, i32 127
+; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction:   %conv4 = trunc i32 %spec.select.i to i8
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   %incdec.ptr5 = getelementptr inbounds i8, i8* %pDst.addr.010, i32 1
+; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   store i8 %conv4, i8* %pDst.addr.010, align 1
+; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %dec = add i32 %blkCnt.012, -1
+; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction:   %cmp.not = icmp eq i32 %dec, 0
+; CHECK: LV: Found an estimated cost of 0 for VF 16 For instruction:   br i1 %cmp.not, label %while.end.loopexit, label %while.body
+; CHECK: LV: Vector loop of width 16 costs: 3.
+; CHECK: LV: Selecting VF: 16.
+define void @cheap_icmp(i8* nocapture readonly %pSrcA, i8* nocapture readonly %pSrcB, i8* nocapture %pDst, i32 %blockSize) #0 {
+entry:
+  %cmp.not8 = icmp eq i32 %blockSize, 0
+  br i1 %cmp.not8, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
+  %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
+  %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
+  %pSrcB.addr.09 = phi i8* [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
+  %0 = load i8, i8* %pSrcA.addr.011, align 1
+  %conv1 = sext i8 %0 to i32
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.09, i32 1
+  %1 = load i8, i8* %pSrcB.addr.09, align 1
+  %conv3 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv1
+  %shr = ashr i32 %mul, 7
+  %2 = icmp slt i32 %shr, 127
+  %spec.select.i = select i1 %2, i32 %shr, i32 127
+  %conv4 = trunc i32 %spec.select.i to i8
+  %incdec.ptr5 = getelementptr inbounds i8, i8* %pDst.addr.010, i32 1
+  store i8 %conv4, i8* %pDst.addr.010, align 1
+  %dec = add i32 %blkCnt.012, -1
+  %cmp.not = icmp eq i32 %dec, 0
+  br i1 %cmp.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { "target-features"="+mve.fp" }