[llvm] be5c15c - [NFC][Costmodel][LV][X86] Refresh one or two interleaved load/store tests

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 15 07:43:43 PDT 2022


Author: Roman Lebedev
Date: 2022-04-15T17:43:18+03:00
New Revision: be5c15c7aee1ef4964c88031555ed6b0f59ebc23

URL: https://github.com/llvm/llvm-project/commit/be5c15c7aee1ef4964c88031555ed6b0f59ebc23
DIFF: https://github.com/llvm/llvm-project/commit/be5c15c7aee1ef4964c88031555ed6b0f59ebc23.diff

LOG: [NFC][Costmodel][LV][X86] Refresh one or two interleaved load/store tests

Added: 
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-7.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-8.ll

Modified: 
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll
index d601b552b1377..1c276e74decc9 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-2.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x float] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 28 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 56 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 38 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 76 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 152 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 38 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 76 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 152 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
-;;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 24 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 3 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 92 for VF 64 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 22 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 92 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load float, float* %in0, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,17 +53,17 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in0 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.1
+  %in0 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.1
 
-  %v0 = load float, float* %in0
-  %v1 = load float, float* %in1
+  %v0 = load float, ptr %in0
+  %v1 = load float, ptr %in1
 
   %reduce.add.0 = fadd float %v0, %v1
 
   %reduce.add.0.narrow = fptoui float %reduce.add.0 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.0.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
index 2d0dbff1ab179..efdd3bcd1d5f6 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-3.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x float] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 21 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 42 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 84 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 57 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 114 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 228 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 24 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 57 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
-
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 6 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 51 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 210 for VF 64 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load float, float* %in0, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,20 +54,20 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in0 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.2
+  %in0 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.2
 
-  %v0 = load float, float* %in0
-  %v1 = load float, float* %in1
-  %v2 = load float, float* %in2
+  %v0 = load float, ptr %in0
+  %v1 = load float, ptr %in1
+  %v2 = load float, ptr %in2
 
   %reduce.add.0 = fadd float %v0, %v1
   %reduce.add.1 = fadd float %reduce.add.0, %v2
 
   %reduce.add.1.narrow = fptoui float %reduce.add.1 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.1.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 3

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
index 10d0cb565979f..b2a5b0d5558e9 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-4.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,38 +11,38 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x float] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 34 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 34 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 76 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 152 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 304 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 92 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 92 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load float, float* %in0, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,15 +54,15 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in0 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.3
+  %in0 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.3
 
-  %v0 = load float, float* %in0
-  %v1 = load float, float* %in1
-  %v2 = load float, float* %in2
-  %v3 = load float, float* %in3
+  %v0 = load float, ptr %in0
+  %v1 = load float, ptr %in1
+  %v2 = load float, ptr %in2
+  %v3 = load float, ptr %in3
 
   %reduce.add.0 = fadd float %v0, %v1
   %reduce.add.1 = fadd float %reduce.add.0, %v2
@@ -69,7 +70,7 @@ for.body:
 
   %reduce.add.2.narrow = fptoui float %reduce.add.2 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.2.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
new file mode 100644
index 0000000000000..0b470aae97ff1
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x float] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 41 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 95 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 190 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 35 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 145 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in0 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.4
+
+  %v0 = load float, ptr %in0
+  %v1 = load float, ptr %in1
+  %v2 = load float, ptr %in2
+  %v3 = load float, ptr %in3
+  %v4 = load float, ptr %in4
+
+  %reduce.add.0 = fadd float %v0, %v1
+  %reduce.add.1 = fadd float %reduce.add.0, %v2
+  %reduce.add.2 = fadd float %reduce.add.1, %v3
+  %reduce.add.3 = fadd float %reduce.add.2, %v4
+
+  %reduce.add.3.narrow = fptoui float %reduce.add.3 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.3.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
index 1572f3dcc8f42..435cb2f70e406 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-6.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,35 +11,35 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x float] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; SSE2: LV: Found an estimated cost of 84 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 51 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 114 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 51 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 114 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 228 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 18 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 37 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX2: LV: Found an estimated cost of 76 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 37 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 76 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction:   %v0 = load float, float* %in0, align 4
-; AVX512: LV: Found an estimated cost of 210 for VF 32 For instruction:   %v0 = load float, float* %in0, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 210 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load float, float* %in0, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,19 +53,19 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in0 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.3
-  %in4 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.4
-  %in5 = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %iv.5
+  %in0 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.5
 
-  %v0 = load float, float* %in0
-  %v1 = load float, float* %in1
-  %v2 = load float, float* %in2
-  %v3 = load float, float* %in3
-  %v4 = load float, float* %in4
-  %v5 = load float, float* %in5
+  %v0 = load float, ptr %in0
+  %v1 = load float, ptr %in1
+  %v2 = load float, ptr %in2
+  %v3 = load float, ptr %in3
+  %v4 = load float, ptr %in4
+  %v5 = load float, ptr %in5
 
   %reduce.add.0 = fadd float %v0, %v1
   %reduce.add.1 = fadd float %reduce.add.0, %v2
@@ -74,7 +75,7 @@ for.body:
 
   %reduce.add.4.narrow = fptoui float %reduce.add.4 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.4.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 6

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
new file mode 100644
index 0000000000000..add73d68ebbcc
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x float] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 27 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 58 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 133 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 266 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in0 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.6
+
+  %v0 = load float, ptr %in0
+  %v1 = load float, ptr %in1
+  %v2 = load float, ptr %in2
+  %v3 = load float, ptr %in3
+  %v4 = load float, ptr %in4
+  %v5 = load float, ptr %in5
+  %v6 = load float, ptr %in6
+
+  %reduce.add.0 = fadd float %v0, %v1
+  %reduce.add.1 = fadd float %reduce.add.0, %v2
+  %reduce.add.2 = fadd float %reduce.add.1, %v3
+  %reduce.add.3 = fadd float %reduce.add.2, %v4
+  %reduce.add.4 = fadd float %reduce.add.3, %v5
+  %reduce.add.5 = fadd float %reduce.add.4, %v6
+
+  %reduce.add.5.narrow = fptoui float %reduce.add.5 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.5.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
new file mode 100644
index 0000000000000..9f6ff5e3162c4
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x float] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 30 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 152 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 304 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in0 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.6
+  %in7 = getelementptr inbounds [1024 x float], ptr @A, i64 0, i64 %iv.7
+
+  %v0 = load float, ptr %in0
+  %v1 = load float, ptr %in1
+  %v2 = load float, ptr %in2
+  %v3 = load float, ptr %in3
+  %v4 = load float, ptr %in4
+  %v5 = load float, ptr %in5
+  %v6 = load float, ptr %in6
+  %v7 = load float, ptr %in7
+
+  %reduce.add.0 = fadd float %v0, %v1
+  %reduce.add.1 = fadd float %reduce.add.0, %v2
+  %reduce.add.2 = fadd float %reduce.add.1, %v3
+  %reduce.add.3 = fadd float %reduce.add.2, %v4
+  %reduce.add.4 = fadd float %reduce.add.3, %v5
+  %reduce.add.5 = fadd float %reduce.add.4, %v6
+  %reduce.add.6 = fadd float %reduce.add.5, %v7
+
+  %reduce.add.6.narrow = fptoui float %reduce.add.6 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.6.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
index 728db12229942..cc2818e14c358 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x double] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 16 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 32 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 64 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 128 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   %v0 = load double, double* %in0, align 8
-;;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 48 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load double, double* %in0, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,17 +53,17 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in0 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.1
+  %in0 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.1
 
-  %v0 = load double, double* %in0
-  %v1 = load double, double* %in1
+  %v0 = load double, ptr %in0
+  %v1 = load double, ptr %in1
 
   %reduce.add.0 = fadd double %v0, %v1
 
   %reduce.add.0.narrow = fptoui double %reduce.add.0 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.0.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
index a61b9ad739caf..af01ed53a107c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,36 +11,36 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x double] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 18 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 36 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 36 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 10 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 24 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 48 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 96 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 10 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 96 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load double, double* %in0, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -50,20 +51,20 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in0 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.2
+  %in0 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.2
 
-  %v0 = load double, double* %in0
-  %v1 = load double, double* %in1
-  %v2 = load double, double* %in2
+  %v0 = load double, ptr %in0
+  %v1 = load double, ptr %in1
+  %v2 = load double, ptr %in2
 
   %reduce.add.0 = fadd double %v0, %v1
   %reduce.add.1 = fadd double %reduce.add.0, %v2
 
   %reduce.add.1.narrow = fptoui double %reduce.add.1 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.1.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 3

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
index 24b2f1d9ae81d..8eaa635358501 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,35 +11,35 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x double] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 32 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 64 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 128 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 32 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 64 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 128 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load double, double* %in0, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -50,15 +51,15 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in0 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.3
+  %in0 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.3
 
-  %v0 = load double, double* %in0
-  %v1 = load double, double* %in1
-  %v2 = load double, double* %in2
-  %v3 = load double, double* %in3
+  %v0 = load double, ptr %in0
+  %v1 = load double, ptr %in1
+  %v2 = load double, ptr %in2
+  %v3 = load double, ptr %in3
 
   %reduce.add.0 = fadd double %v0, %v1
   %reduce.add.1 = fadd double %reduce.add.0, %v2
@@ -66,7 +67,7 @@ for.body:
 
   %reduce.add.2.narrow = fptoui double %reduce.add.2 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.2.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
new file mode 100644
index 0000000000000..551ce2fbccb31
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x double] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in0 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.4
+
+  %v0 = load double, ptr %in0
+  %v1 = load double, ptr %in1
+  %v2 = load double, ptr %in2
+  %v3 = load double, ptr %in3
+  %v4 = load double, ptr %in4
+
+  %reduce.add.0 = fadd double %v0, %v1
+  %reduce.add.1 = fadd double %reduce.add.0, %v2
+  %reduce.add.2 = fadd double %reduce.add.1, %v3
+  %reduce.add.3 = fadd double %reduce.add.2, %v4
+
+  %reduce.add.3.narrow = fptoui double %reduce.add.3 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.3.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
index 186560899de33..124c864320a30 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,32 +11,32 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x double] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; SSE2: LV: Found an estimated cost of 36 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 36 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 48 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX1: LV: Found an estimated cost of 96 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 48 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 96 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 24 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction:   %v0 = load double, double* %in0, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction:   %v0 = load double, double* %in0, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load double, double* %in0, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -49,19 +50,19 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in0 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.3
-  %in4 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.4
-  %in5 = getelementptr inbounds [1024 x double], [1024 x double]* @A, i64 0, i64 %iv.5
+  %in0 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.5
 
-  %v0 = load double, double* %in0
-  %v1 = load double, double* %in1
-  %v2 = load double, double* %in2
-  %v3 = load double, double* %in3
-  %v4 = load double, double* %in4
-  %v5 = load double, double* %in5
+  %v0 = load double, ptr %in0
+  %v1 = load double, ptr %in1
+  %v2 = load double, ptr %in2
+  %v3 = load double, ptr %in3
+  %v4 = load double, ptr %in4
+  %v5 = load double, ptr %in5
 
   %reduce.add.0 = fadd double %v0, %v1
   %reduce.add.1 = fadd double %reduce.add.0, %v2
@@ -71,7 +72,7 @@ for.body:
 
   %reduce.add.4.narrow = fptoui double %reduce.add.4 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.4.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 6

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
new file mode 100644
index 0000000000000..572795bd90f82
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x double] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in0 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.6
+
+  %v0 = load double, ptr %in0
+  %v1 = load double, ptr %in1
+  %v2 = load double, ptr %in2
+  %v3 = load double, ptr %in3
+  %v4 = load double, ptr %in4
+  %v5 = load double, ptr %in5
+  %v6 = load double, ptr %in6
+
+  %reduce.add.0 = fadd double %v0, %v1
+  %reduce.add.1 = fadd double %reduce.add.0, %v2
+  %reduce.add.2 = fadd double %reduce.add.1, %v3
+  %reduce.add.3 = fadd double %reduce.add.2, %v4
+  %reduce.add.4 = fadd double %reduce.add.3, %v5
+  %reduce.add.5 = fadd double %reduce.add.4, %v6
+
+  %reduce.add.5.narrow = fptoui double %reduce.add.5 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.5.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
new file mode 100644
index 0000000000000..0643d645f0f48
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x double] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 48 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in0 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.6
+  %in7 = getelementptr inbounds [1024 x double], ptr @A, i64 0, i64 %iv.7
+
+  %v0 = load double, ptr %in0
+  %v1 = load double, ptr %in1
+  %v2 = load double, ptr %in2
+  %v3 = load double, ptr %in3
+  %v4 = load double, ptr %in4
+  %v5 = load double, ptr %in5
+  %v6 = load double, ptr %in6
+  %v7 = load double, ptr %in7
+
+  %reduce.add.0 = fadd double %v0, %v1
+  %reduce.add.1 = fadd double %reduce.add.0, %v2
+  %reduce.add.2 = fadd double %reduce.add.1, %v3
+  %reduce.add.3 = fadd double %reduce.add.2, %v4
+  %reduce.add.4 = fadd double %reduce.add.3, %v5
+  %reduce.add.5 = fadd double %reduce.add.4, %v6
+  %reduce.add.6 = fadd double %reduce.add.5, %v7
+
+  %reduce.add.6.narrow = fptoui double %reduce.add.6 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.6.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
index 2f9f923aad00c..1be85b1e7a40c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 34 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 68 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 8 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 68 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 86 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 172 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 41 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 11 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 22 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 7 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 11 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 22 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 10 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 20 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 372 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 7 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 34 for VF 64 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 7 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 34 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,17 +63,17 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
+  %in0 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.1
 
-  %v0 = load i16, i16* %in0
-  %v1 = load i16, i16* %in1
+  %v0 = load i16, ptr %in0
+  %v1 = load i16, ptr %in1
 
   %reduce.add.0 = add i16 %v0, %v1
 
   %reduce.add.0.narrow = trunc i16 %reduce.add.0 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.0.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
index b7b78185df987..4096f7774a95f 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 26 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 102 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 26 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 51 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 102 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 129 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 258 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 31 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 58 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 11 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 31 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 62 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 11 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 31 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 62 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 59 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 558 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 7 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 7 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 9 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 18 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 81 for VF 64 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 9 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 18 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 81 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,20 +64,20 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2
+  %in0 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.2
 
-  %v0 = load i16, i16* %in0
-  %v1 = load i16, i16* %in1
-  %v2 = load i16, i16* %in2
+  %v0 = load i16, ptr %in0
+  %v1 = load i16, ptr %in1
+  %v2 = load i16, ptr %in2
 
   %reduce.add.0 = add i16 %v0, %v1
   %reduce.add.1 = add i16 %reduce.add.0, %v2
 
   %reduce.add.1.narrow = trunc i16 %reduce.add.1 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.1.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 3

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
index 070eef9bfc280..50fe75474adae 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 34 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 68 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 136 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 34 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 68 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 136 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 41 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 82 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 172 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 344 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 41 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 82 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 79 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 158 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 18 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 35 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 79 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 158 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 34 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 77 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 154 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 744 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 9 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 9 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 34 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 148 for VF 64 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 9 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 34 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 148 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -54,15 +65,15 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3
+  %in0 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.3
 
-  %v0 = load i16, i16* %in0
-  %v1 = load i16, i16* %in1
-  %v2 = load i16, i16* %in2
-  %v3 = load i16, i16* %in3
+  %v0 = load i16, ptr %in0
+  %v1 = load i16, ptr %in1
+  %v2 = load i16, ptr %in2
+  %v3 = load i16, ptr %in3
 
   %reduce.add.0 = add i16 %v0, %v1
   %reduce.add.1 = add i16 %reduce.add.0, %v2
@@ -70,7 +81,7 @@ for.body:
 
   %reduce.add.2.narrow = trunc i16 %reduce.add.2 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.2.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
index 3d002698ebd83..87e410f72ba3c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 22 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 43 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 215 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 430 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 430 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 340 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 340 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 55 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 106 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 229 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 465 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 930 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 14 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 28 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 55 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 235 for VF 64 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 55 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 235 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -55,17 +66,17 @@ for.body:
   %iv.3 = add nuw nsw i64 %iv, 3
   %iv.4 = add nuw nsw i64 %iv, 4
 
-  %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3
-  %in4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.4
+  %in0 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.4
 
-  %v0 = load i16, i16* %in0
-  %v1 = load i16, i16* %in1
-  %v2 = load i16, i16* %in2
-  %v3 = load i16, i16* %in3
-  %v4 = load i16, i16* %in4
+  %v0 = load i16, ptr %in0
+  %v1 = load i16, ptr %in1
+  %v2 = load i16, ptr %in2
+  %v3 = load i16, ptr %in3
+  %v4 = load i16, ptr %in4
 
   %reduce.add.0 = add i16 %v0, %v1
   %reduce.add.1 = add i16 %reduce.add.0, %v2
@@ -74,7 +85,7 @@ for.body:
 
   %reduce.add.3.narrow = trunc i16 %reduce.add.3 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.3.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 5

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
index 938fce58cdd36..5fd171b2645fc 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i16] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 51 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 102 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 204 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 31 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 58 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 123 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 258 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 516 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 31 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 123 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 516 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 224 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 42 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 112 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 224 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 109 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 218 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 1116 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 13 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 13 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 17 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 33 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 81 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX512: LV: Found an estimated cost of 342 for VF 64 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 13 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 13 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 81 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 342 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i16, i16* %in0, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -56,19 +67,19 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.3
-  %in4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.4
-  %in5 = getelementptr inbounds [1024 x i16], [1024 x i16]* @A, i64 0, i64 %iv.5
+  %in0 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.5
 
-  %v0 = load i16, i16* %in0
-  %v1 = load i16, i16* %in1
-  %v2 = load i16, i16* %in2
-  %v3 = load i16, i16* %in3
-  %v4 = load i16, i16* %in4
-  %v5 = load i16, i16* %in5
+  %v0 = load i16, ptr %in0
+  %v1 = load i16, ptr %in1
+  %v2 = load i16, ptr %in2
+  %v3 = load i16, ptr %in3
+  %v4 = load i16, ptr %in4
+  %v5 = load i16, ptr %in5
 
   %reduce.add.0 = add i16 %v0, %v1
   %reduce.add.1 = add i16 %reduce.add.0, %v2
@@ -78,7 +89,7 @@ for.body:
 
   %reduce.add.4.narrow = trunc i16 %reduce.add.4 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.4.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 6

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
new file mode 100644
index 0000000000000..97a2f0e7c56e5
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i16] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 39 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 140 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 301 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 602 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 476 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 39 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 81 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 156 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 322 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 651 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 1302 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 15 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 19 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 112 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 469 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in0 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.6
+
+  %v0 = load i16, ptr %in0
+  %v1 = load i16, ptr %in1
+  %v2 = load i16, ptr %in2
+  %v3 = load i16, ptr %in3
+  %v4 = load i16, ptr %in4
+  %v5 = load i16, ptr %in5
+  %v6 = load i16, ptr %in6
+
+  %reduce.add.0 = add i16 %v0, %v1
+  %reduce.add.1 = add i16 %reduce.add.0, %v2
+  %reduce.add.2 = add i16 %reduce.add.1, %v3
+  %reduce.add.3 = add i16 %reduce.add.2, %v4
+  %reduce.add.4 = add i16 %reduce.add.3, %v5
+  %reduce.add.5 = add i16 %reduce.add.4, %v6
+
+  %reduce.add.5.narrow = trunc i16 %reduce.add.5 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.5.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
new file mode 100644
index 0000000000000..8169980d47a6d
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i16] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2:  LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 41 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 82 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 164 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 344 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1:  LV: Found an estimated cost of 688 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2:  LV: Found an estimated cost of 544 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 41 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 89 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 178 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 372 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 744 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ:  LV: Found an estimated cost of 1488 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 64 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 148 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW:  LV: Found an estimated cost of 616 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in0 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.6
+  %in7 = getelementptr inbounds [1024 x i16], ptr @A, i64 0, i64 %iv.7
+
+  %v0 = load i16, ptr %in0
+  %v1 = load i16, ptr %in1
+  %v2 = load i16, ptr %in2
+  %v3 = load i16, ptr %in3
+  %v4 = load i16, ptr %in4
+  %v5 = load i16, ptr %in5
+  %v6 = load i16, ptr %in6
+  %v7 = load i16, ptr %in7
+
+  %reduce.add.0 = add i16 %v0, %v1
+  %reduce.add.1 = add i16 %reduce.add.0, %v2
+  %reduce.add.2 = add i16 %reduce.add.1, %v3
+  %reduce.add.3 = add i16 %reduce.add.2, %v4
+  %reduce.add.4 = add i16 %reduce.add.3, %v5
+  %reduce.add.5 = add i16 %reduce.add.4, %v6
+  %reduce.add.6 = add i16 %reduce.add.5, %v7
+
+  %reduce.add.6.narrow = trunc i16 %reduce.add.6 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.6.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll
index 2468ca6891b75..fd2f7ba4a1ea2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-2.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 60 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 46 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 92 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 184 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 46 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 184 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 24 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 3 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 92 for VF 64 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 22 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 92 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i32, i32* %in0, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,17 +53,17 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1
+  %in0 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.1
 
-  %v0 = load i32, i32* %in0
-  %v1 = load i32, i32* %in1
+  %v0 = load i32, ptr %in0
+  %v1 = load i32, ptr %in1
 
   %reduce.add.0 = add i32 %v0, %v1
 
   %reduce.add.0.narrow = trunc i32 %reduce.add.0 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.0.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
index b3b790cae3055..1a7fb70d64b6e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 45 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 90 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 180 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 180 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 69 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 138 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 276 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 69 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 138 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 276 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 6 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 51 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 210 for VF 64 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i32, i32* %in0, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,20 +54,20 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.2
+  %in0 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.2
 
-  %v0 = load i32, i32* %in0
-  %v1 = load i32, i32* %in1
-  %v2 = load i32, i32* %in2
+  %v0 = load i32, ptr %in0
+  %v1 = load i32, ptr %in1
+  %v2 = load i32, ptr %in2
 
   %reduce.add.0 = add i32 %v0, %v1
   %reduce.add.1 = add i32 %reduce.add.0, %v2
 
   %reduce.add.1.narrow = trunc i32 %reduce.add.1 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.1.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 3

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
index c3eb498fb9be3..885024894c64b 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,38 +11,38 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 240 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 42 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 92 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 184 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 368 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 92 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 184 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 368 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 92 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 92 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i32, i32* %in0, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,15 +54,15 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.3
+  %in0 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.3
 
-  %v0 = load i32, i32* %in0
-  %v1 = load i32, i32* %in1
-  %v2 = load i32, i32* %in2
-  %v3 = load i32, i32* %in3
+  %v0 = load i32, ptr %in0
+  %v1 = load i32, ptr %in1
+  %v2 = load i32, ptr %in2
+  %v3 = load i32, ptr %in3
 
   %reduce.add.0 = add i32 %v0, %v1
   %reduce.add.1 = add i32 %reduce.add.0, %v2
@@ -69,7 +70,7 @@ for.body:
 
   %reduce.add.2.narrow = trunc i32 %reduce.add.2 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.2.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
new file mode 100644
index 0000000000000..83a98b404fb6d
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i32] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 75 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 150 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 51 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 115 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 230 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 180 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 35 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 145 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in0 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.4
+
+  %v0 = load i32, ptr %in0
+  %v1 = load i32, ptr %in1
+  %v2 = load i32, ptr %in2
+  %v3 = load i32, ptr %in3
+  %v4 = load i32, ptr %in4
+
+  %reduce.add.0 = add i32 %v0, %v1
+  %reduce.add.1 = add i32 %reduce.add.0, %v2
+  %reduce.add.2 = add i32 %reduce.add.1, %v3
+  %reduce.add.3 = add i32 %reduce.add.2, %v4
+
+  %reduce.add.3.narrow = trunc i32 %reduce.add.3 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.3.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
index b4c67ebd8aaf3..8e7e7d9bfd9a5 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-6.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,35 +11,35 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i32] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 90 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; SSE2: LV: Found an estimated cost of 180 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 42 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 90 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 180 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 63 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 138 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 276 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 30 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 138 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 276 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 18 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 37 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 76 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 37 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 76 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction:   %v0 = load i32, i32* %in0, align 4
-; AVX512: LV: Found an estimated cost of 210 for VF 32 For instruction:   %v0 = load i32, i32* %in0, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 210 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i32, i32* %in0, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,19 +53,19 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.3
-  %in4 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.4
-  %in5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %iv.5
+  %in0 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.5
 
-  %v0 = load i32, i32* %in0
-  %v1 = load i32, i32* %in1
-  %v2 = load i32, i32* %in2
-  %v3 = load i32, i32* %in3
-  %v4 = load i32, i32* %in4
-  %v5 = load i32, i32* %in5
+  %v0 = load i32, ptr %in0
+  %v1 = load i32, ptr %in1
+  %v2 = load i32, ptr %in2
+  %v3 = load i32, ptr %in3
+  %v4 = load i32, ptr %in4
+  %v5 = load i32, ptr %in5
 
   %reduce.add.0 = add i32 %v0, %v1
   %reduce.add.1 = add i32 %reduce.add.0, %v2
@@ -74,7 +75,7 @@ for.body:
 
   %reduce.add.4.narrow = trunc i32 %reduce.add.4 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.4.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 6

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
new file mode 100644
index 0000000000000..07745d4fe390e
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i32] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 49 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 105 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 210 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 38 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 161 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 322 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 126 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 252 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in0 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.6
+
+  %v0 = load i32, ptr %in0
+  %v1 = load i32, ptr %in1
+  %v2 = load i32, ptr %in2
+  %v3 = load i32, ptr %in3
+  %v4 = load i32, ptr %in4
+  %v5 = load i32, ptr %in5
+  %v6 = load i32, ptr %in6
+
+  %reduce.add.0 = add i32 %v0, %v1
+  %reduce.add.1 = add i32 %reduce.add.0, %v2
+  %reduce.add.2 = add i32 %reduce.add.1, %v3
+  %reduce.add.3 = add i32 %reduce.add.2, %v4
+  %reduce.add.4 = add i32 %reduce.add.3, %v5
+  %reduce.add.5 = add i32 %reduce.add.4, %v6
+
+  %reduce.add.5.narrow = trunc i32 %reduce.add.5 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.5.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
new file mode 100644
index 0000000000000..53b0810619e30
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i32] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 120 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2:  LV: Found an estimated cost of 240 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 42 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 84 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 184 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1:  LV: Found an estimated cost of 368 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2:  LV: Found an estimated cost of 288 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512:  LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in0 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.6
+  %in7 = getelementptr inbounds [1024 x i32], ptr @A, i64 0, i64 %iv.7
+
+  %v0 = load i32, ptr %in0
+  %v1 = load i32, ptr %in1
+  %v2 = load i32, ptr %in2
+  %v3 = load i32, ptr %in3
+  %v4 = load i32, ptr %in4
+  %v5 = load i32, ptr %in5
+  %v6 = load i32, ptr %in6
+  %v7 = load i32, ptr %in7
+
+  %reduce.add.0 = add i32 %v0, %v1
+  %reduce.add.1 = add i32 %reduce.add.0, %v2
+  %reduce.add.2 = add i32 %reduce.add.1, %v3
+  %reduce.add.3 = add i32 %reduce.add.2, %v4
+  %reduce.add.4 = add i32 %reduce.add.3, %v5
+  %reduce.add.5 = add i32 %reduce.add.4, %v6
+  %reduce.add.6 = add i32 %reduce.add.5, %v7
+
+  %reduce.add.6.narrow = trunc i32 %reduce.add.6 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.6.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
index cffb0ee48173a..964bfcb8355da 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i64] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 26 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 52 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 104 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 208 for VF 32 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 26 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 52 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 104 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 208 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 48 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i64, i64* %in0, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,17 +53,17 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in0 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.1
+  %in0 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.1
 
-  %v0 = load i64, i64* %in0
-  %v1 = load i64, i64* %in1
+  %v0 = load i64, ptr %in0
+  %v1 = load i64, ptr %in1
 
   %reduce.add.0 = add i64 %v0, %v1
 
   %reduce.add.0.narrow = trunc i64 %reduce.add.0 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.0.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
index 2df64b0d3b57f..d3e8241c84c5a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,36 +11,36 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i64] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 84 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 16 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 39 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 78 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 156 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 39 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 78 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 156 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i64, i64* %in0, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -50,20 +51,20 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in0 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.2
+  %in0 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.2
 
-  %v0 = load i64, i64* %in0
-  %v1 = load i64, i64* %in1
-  %v2 = load i64, i64* %in2
+  %v0 = load i64, ptr %in0
+  %v1 = load i64, ptr %in1
+  %v2 = load i64, ptr %in2
 
   %reduce.add.0 = add i64 %v0, %v1
   %reduce.add.1 = add i64 %reduce.add.0, %v2
 
   %reduce.add.1.narrow = trunc i64 %reduce.add.1 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.1.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 3

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
index 5b0001ed9431b..03ed8ba667a90 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,35 +11,35 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i64] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 22 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 104 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 208 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 22 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 52 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 104 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 208 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i64, i64* %in0, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -50,15 +51,15 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in0 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.3
+  %in0 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.3
 
-  %v0 = load i64, i64* %in0
-  %v1 = load i64, i64* %in1
-  %v2 = load i64, i64* %in2
-  %v3 = load i64, i64* %in3
+  %v0 = load i64, ptr %in0
+  %v1 = load i64, ptr %in1
+  %v2 = load i64, ptr %in2
+  %v3 = load i64, ptr %in3
 
   %reduce.add.0 = add i64 %v0, %v1
   %reduce.add.1 = add i64 %reduce.add.0, %v2
@@ -66,7 +67,7 @@ for.body:
 
   %reduce.add.2.narrow = trunc i64 %reduce.add.2 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.2.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
new file mode 100644
index 0000000000000..4fb2a1067532c
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i64] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 70 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 27 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 65 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 130 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 100 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in0 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.4
+
+  %v0 = load i64, ptr %in0
+  %v1 = load i64, ptr %in1
+  %v2 = load i64, ptr %in2
+  %v3 = load i64, ptr %in3
+  %v4 = load i64, ptr %in4
+
+  %reduce.add.0 = add i64 %v0, %v1
+  %reduce.add.1 = add i64 %reduce.add.0, %v2
+  %reduce.add.2 = add i64 %reduce.add.1, %v3
+  %reduce.add.3 = add i64 %reduce.add.2, %v4
+
+  %reduce.add.3.narrow = trunc i64 %reduce.add.3 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.3.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
index a25da5d8f175f..2bc423853dfaa 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,32 +11,32 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i64] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; SSE2: LV: Found an estimated cost of 84 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 42 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 84 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 33 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 78 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX1: LV: Found an estimated cost of 156 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 78 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 156 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 24 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction:   %v0 = load i64, i64* %in0, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction:   %v0 = load i64, i64* %in0, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i64, i64* %in0, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -49,19 +50,19 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in0 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.3
-  %in4 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.4
-  %in5 = getelementptr inbounds [1024 x i64], [1024 x i64]* @A, i64 0, i64 %iv.5
+  %in0 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.5
 
-  %v0 = load i64, i64* %in0
-  %v1 = load i64, i64* %in1
-  %v2 = load i64, i64* %in2
-  %v3 = load i64, i64* %in3
-  %v4 = load i64, i64* %in4
-  %v5 = load i64, i64* %in5
+  %v0 = load i64, ptr %in0
+  %v1 = load i64, ptr %in1
+  %v2 = load i64, ptr %in2
+  %v3 = load i64, ptr %in3
+  %v4 = load i64, ptr %in4
+  %v5 = load i64, ptr %in5
 
   %reduce.add.0 = add i64 %v0, %v1
   %reduce.add.1 = add i64 %reduce.add.0, %v2
@@ -71,7 +72,7 @@ for.body:
 
   %reduce.add.4.narrow = trunc i64 %reduce.add.4 to i8
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.4.narrow, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 6

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
new file mode 100644
index 0000000000000..76bb8892b97c0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i64] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 49 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 98 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 38 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 91 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 182 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 70 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 140 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in0 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.6
+
+  %v0 = load i64, ptr %in0
+  %v1 = load i64, ptr %in1
+  %v2 = load i64, ptr %in2
+  %v3 = load i64, ptr %in3
+  %v4 = load i64, ptr %in4
+  %v5 = load i64, ptr %in5
+  %v6 = load i64, ptr %in6
+
+  %reduce.add.0 = add i64 %v0, %v1
+  %reduce.add.1 = add i64 %reduce.add.0, %v2
+  %reduce.add.2 = add i64 %reduce.add.1, %v3
+  %reduce.add.3 = add i64 %reduce.add.2, %v4
+  %reduce.add.4 = add i64 %reduce.add.3, %v5
+  %reduce.add.5 = add i64 %reduce.add.4, %v6
+
+  %reduce.add.5.narrow = trunc i64 %reduce.add.5 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.5.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
new file mode 100644
index 0000000000000..807b6df9617cc
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i64] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2:  LV: Found an estimated cost of 112 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 44 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 104 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1:  LV: Found an estimated cost of 208 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 80 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2:  LV: Found an estimated cost of 160 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512:  LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in0 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.6
+  %in7 = getelementptr inbounds [1024 x i64], ptr @A, i64 0, i64 %iv.7
+
+  %v0 = load i64, ptr %in0
+  %v1 = load i64, ptr %in1
+  %v2 = load i64, ptr %in2
+  %v3 = load i64, ptr %in3
+  %v4 = load i64, ptr %in4
+  %v5 = load i64, ptr %in5
+  %v6 = load i64, ptr %in6
+  %v7 = load i64, ptr %in7
+
+  %reduce.add.0 = add i64 %v0, %v1
+  %reduce.add.1 = add i64 %reduce.add.0, %v2
+  %reduce.add.2 = add i64 %reduce.add.1, %v3
+  %reduce.add.3 = add i64 %reduce.add.2, %v4
+  %reduce.add.4 = add i64 %reduce.add.3, %v5
+  %reduce.add.5 = add i64 %reduce.add.4, %v6
+  %reduce.add.6 = add i64 %reduce.add.5, %v7
+
+  %reduce.add.6.narrow = trunc i64 %reduce.add.6 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.6.narrow, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
index 184d7eda3e615..9c81d7aef45dd 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 62 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 126 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 33 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 81 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 166 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 81 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 8 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 3 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 5 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 8 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 7 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 362 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 3 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 9 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 17 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 41 for VF 64 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 9 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 17 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 41 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i8, i8* %in0, align 1
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,15 +63,15 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.1
+  %in0 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.1
 
-  %v0 = load i8, i8* %in0
-  %v1 = load i8, i8* %in1
+  %v0 = load i8, ptr %in0
+  %v1 = load i8, ptr %in1
 
   %reduce.add.0 = add i8 %v0, %v1
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.0, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
index e3ecd89d30f88..977fe6b56ad09 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 93 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 189 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 23 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 93 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 189 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 27 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 59 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 114 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 249 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 27 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 59 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 249 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 9 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 13 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 17 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 9 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 13 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 17 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 9 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 14 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 16 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 543 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 13 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 13 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 16 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 25 for VF 64 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 13 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 13 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 16 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 25 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i8, i8* %in0, align 1
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,18 +64,18 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.2
+  %in0 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.2
 
-  %v0 = load i8, i8* %in0
-  %v1 = load i8, i8* %in1
-  %v2 = load i8, i8* %in2
+  %v0 = load i8, ptr %in0
+  %v1 = load i8, ptr %in1
+  %v2 = load i8, ptr %in2
 
   %reduce.add.0 = add i8 %v0, %v1
   %reduce.add.1 = add i8 %reduce.add.0, %v2
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.1, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 3

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll
index 8866bb0c0420f..9645fea8d0399 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-4.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 124 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 252 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 124 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 252 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 81 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 162 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 332 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 33 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 81 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 162 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 332 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 13 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 26 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 13 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 26 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 13 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 25 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 58 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 724 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 17 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 33 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 238 for VF 64 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 238 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i8, i8* %in0, align 1
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -54,21 +65,21 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.3
+  %in0 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.3
 
-  %v0 = load i8, i8* %in0
-  %v1 = load i8, i8* %in1
-  %v2 = load i8, i8* %in2
-  %v3 = load i8, i8* %in3
+  %v0 = load i8, ptr %in0
+  %v1 = load i8, ptr %in1
+  %v2 = load i8, ptr %in2
+  %v3 = load i8, ptr %in3
 
   %reduce.add.0 = add i8 %v0, %v1
   %reduce.add.1 = add i8 %reduce.add.0, %v2
   %reduce.add.2 = add i8 %reduce.add.1, %v3
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.2, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 4

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
new file mode 100644
index 0000000000000..c1d22742ff05c
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 37 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 75 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 155 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 315 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 48 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 195 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 415 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 330 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 48 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 107 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 445 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 905 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 99 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 198 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 395 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in0 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.4
+
+  %v0 = load i8, ptr %in0
+  %v1 = load i8, ptr %in1
+  %v2 = load i8, ptr %in2
+  %v3 = load i8, ptr %in3
+  %v4 = load i8, ptr %in4
+
+  %reduce.add.0 = add i8 %v0, %v1
+  %reduce.add.1 = add i8 %reduce.add.0, %v2
+  %reduce.add.2 = add i8 %reduce.add.1, %v3
+  %reduce.add.3 = add i8 %reduce.add.2, %v4
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.3, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
index 5d371c326cd52..e36240e834a39 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 47 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 90 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 186 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 378 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 47 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 90 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 186 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 378 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 27 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 59 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 114 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 243 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 498 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 59 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 114 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 243 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 498 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 46 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 88 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 46 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 88 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 45 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 85 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 1086 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 25 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 49 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 119 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 237 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX512: LV: Found an estimated cost of 591 for VF 64 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 25 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 49 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 119 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 237 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 591 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   %v0 = load i8, i8* %in0, align 1
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -56,19 +67,19 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %in1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.1
-  %in2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.2
-  %in3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.3
-  %in4 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.4
-  %in5 = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.5
+  %in0 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.5
 
-  %v0 = load i8, i8* %in0
-  %v1 = load i8, i8* %in1
-  %v2 = load i8, i8* %in2
-  %v3 = load i8, i8* %in3
-  %v4 = load i8, i8* %in4
-  %v5 = load i8, i8* %in5
+  %v0 = load i8, ptr %in0
+  %v1 = load i8, ptr %in1
+  %v2 = load i8, ptr %in2
+  %v3 = load i8, ptr %in3
+  %v4 = load i8, ptr %in4
+  %v5 = load i8, ptr %in5
 
   %reduce.add.0 = add i8 %v0, %v1
   %reduce.add.1 = add i8 %reduce.add.0, %v2
@@ -76,7 +87,7 @@ for.body:
   %reduce.add.3 = add i8 %reduce.add.2, %v4
   %reduce.add.4 = add i8 %reduce.add.3, %v5
 
-  %out = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
   store i8 %reduce.add.4, i8* %out
 
   %iv.next = add nuw nsw i64 %iv.0, 6

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
new file mode 100644
index 0000000000000..ce52d84ac5f0d
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 110 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 217 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 441 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 73 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 140 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 276 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 581 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 462 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 73 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 157 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 308 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 626 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 1267 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 29 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 57 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 138 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 413 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 826 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in0 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.6
+
+  %v0 = load i8, ptr %in0
+  %v1 = load i8, ptr %in1
+  %v2 = load i8, ptr %in2
+  %v3 = load i8, ptr %in3
+  %v4 = load i8, ptr %in4
+  %v5 = load i8, ptr %in5
+  %v6 = load i8, ptr %in6
+
+  %reduce.add.0 = add i8 %v0, %v1
+  %reduce.add.1 = add i8 %reduce.add.0, %v2
+  %reduce.add.2 = add i8 %reduce.add.1, %v3
+  %reduce.add.3 = add i8 %reduce.add.2, %v4
+  %reduce.add.4 = add i8 %reduce.add.3, %v5
+  %reduce.add.5 = add i8 %reduce.add.4, %v6
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.5, i8* %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
new file mode 100644
index 0000000000000..62b4900174573
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() { ; each iteration loads 8 adjacent i8 elements of @A, sums them and stores the sum to @B
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 120 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 248 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2:  LV: Found an estimated cost of 504 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 81 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 162 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 324 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1:  LV: Found an estimated cost of 664 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2:  LV: Found an estimated cost of 528 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 81 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 177 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 354 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 724 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ:  LV: Found an estimated cost of 1448 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 65 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 158 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 472 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW:  LV: Found an estimated cost of 1100 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ; loop index: 0 on entry, %iv.next on the back edge
+
+  %iv.0 = add nuw nsw i64 %iv, 0 ; element indices %iv+0 through %iv+7
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in0 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0 ; addresses of the eight adjacent @A elements
+  %in1 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.1
+  %in2 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.2
+  %in3 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.3
+  %in4 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.4
+  %in5 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.5
+  %in6 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.6
+  %in7 = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.7
+
+  %v0 = load i8, ptr %in0 ; this is the load named by the UTC --filter pattern in the file header
+  %v1 = load i8, ptr %in1
+  %v2 = load i8, ptr %in2
+  %v3 = load i8, ptr %in3
+  %v4 = load i8, ptr %in4
+  %v5 = load i8, ptr %in5
+  %v6 = load i8, ptr %in6
+  %v7 = load i8, ptr %in7
+
+  %reduce.add.0 = add i8 %v0, %v1 ; chain of adds that consumes all eight loaded values
+  %reduce.add.1 = add i8 %reduce.add.0, %v2
+  %reduce.add.2 = add i8 %reduce.add.1, %v3
+  %reduce.add.3 = add i8 %reduce.add.2, %v4
+  %reduce.add.4 = add i8 %reduce.add.3, %v5
+  %reduce.add.5 = add i8 %reduce.add.4, %v6
+  %reduce.add.6 = add i8 %reduce.add.5, %v7
+
+  %out = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  store i8 %reduce.add.6, ptr %out ; opaque-pointer operand, same spelling as the loads and GEPs in this function
+
+  %iv.next = add nuw nsw i64 %iv.0, 8 ; advance by 8, the number of loads per iteration
+  %cmp = icmp ult i64 %iv.next, 1024 ; keep looping while the next index is below 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll
index 41bcb1844f348..ab904c895e57a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-2.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store float %v1, ptr %out1"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x float] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v1, float* %out1, align 4
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store float %v1, float* %out1, align 4
-; SSE2: LV: Found an estimated cost of 14 for VF 4 For instruction:   store float %v1, float* %out1, align 4
-; SSE2: LV: Found an estimated cost of 28 for VF 8 For instruction:   store float %v1, float* %out1, align 4
-; SSE2: LV: Found an estimated cost of 56 for VF 16 For instruction:   store float %v1, float* %out1, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4
+; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store float %v1, ptr %out1, align 4
+; SSE2:  LV: Found an estimated cost of 14 for VF 4 For instruction: store float %v1, ptr %out1, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 8 For instruction: store float %v1, ptr %out1, align 4
+; SSE2:  LV: Found an estimated cost of 56 for VF 16 For instruction: store float %v1, ptr %out1, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v1, float* %out1, align 4
-; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   store float %v1, float* %out1, align 4
-; AVX1: LV: Found an estimated cost of 15 for VF 4 For instruction:   store float %v1, float* %out1, align 4
-; AVX1: LV: Found an estimated cost of 38 for VF 8 For instruction:   store float %v1, float* %out1, align 4
-; AVX1: LV: Found an estimated cost of 76 for VF 16 For instruction:   store float %v1, float* %out1, align 4
-; AVX1: LV: Found an estimated cost of 152 for VF 32 For instruction:   store float %v1, float* %out1, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store float %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 15 for VF 4 For instruction: store float %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 38 for VF 8 For instruction: store float %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 76 for VF 16 For instruction: store float %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 152 for VF 32 For instruction: store float %v1, ptr %out1, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v1, float* %out1, align 4
-; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store float %v1, float* %out1, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction:   store float %v1, float* %out1, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction:   store float %v1, float* %out1, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction:   store float %v1, float* %out1, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction:   store float %v1, float* %out1, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store float %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 3 for VF 4 For instruction: store float %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 6 for VF 8 For instruction: store float %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 12 for VF 16 For instruction: store float %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 24 for VF 32 For instruction: store float %v1, ptr %out1, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v1, float* %out1, align 4
-; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction:   store float %v1, float* %out1, align 4
-; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction:   store float %v1, float* %out1, align 4
-; AVX512: LV: Found an estimated cost of 2 for VF 8 For instruction:   store float %v1, float* %out1, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction:   store float %v1, float* %out1, align 4
-; AVX512: LV: Found an estimated cost of 10 for VF 32 For instruction:   store float %v1, float* %out1, align 4
-; AVX512: LV: Found an estimated cost of 20 for VF 64 For instruction:   store float %v1, float* %out1, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 2 for VF 2 For instruction: store float %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 2 for VF 4 For instruction: store float %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 2 for VF 8 For instruction: store float %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 16 For instruction: store float %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 10 for VF 32 For instruction: store float %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 20 for VF 64 For instruction: store float %v1, ptr %out1, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store float %v1, float* %out1, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,19 +53,19 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = uitofp i8 %v.narrow to float
 
   %v0 = fadd float %v, 0.0
   %v1 = fadd float %v, 1.0
 
-  %out0 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.1
+  %out0 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.1
 
-  store float %v0, float* %out0
-  store float %v1, float* %out1
+  store float %v0, ptr %out0
+  store float %v1, ptr %out1
 
   %iv.next = add nuw nsw i64 %iv.0, 2
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
index 637a43de59da6..05f55e6d18aba 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-3.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store float %v2, ptr %out2"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x float] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v2, float* %out2, align 4
-; SSE2: LV: Found an estimated cost of 11 for VF 2 For instruction:   store float %v2, float* %out2, align 4
-; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction:   store float %v2, float* %out2, align 4
-; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction:   store float %v2, float* %out2, align 4
-; SSE2: LV: Found an estimated cost of 96 for VF 16 For instruction:   store float %v2, float* %out2, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4
+; SSE2:  LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v2, ptr %out2, align 4
+; SSE2:  LV: Found an estimated cost of 24 for VF 4 For instruction: store float %v2, ptr %out2, align 4
+; SSE2:  LV: Found an estimated cost of 48 for VF 8 For instruction: store float %v2, ptr %out2, align 4
+; SSE2:  LV: Found an estimated cost of 96 for VF 16 For instruction: store float %v2, ptr %out2, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v2, float* %out2, align 4
-; AVX1: LV: Found an estimated cost of 13 for VF 2 For instruction:   store float %v2, float* %out2, align 4
-; AVX1: LV: Found an estimated cost of 23 for VF 4 For instruction:   store float %v2, float* %out2, align 4
-; AVX1: LV: Found an estimated cost of 57 for VF 8 For instruction:   store float %v2, float* %out2, align 4
-; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   store float %v2, float* %out2, align 4
-; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction:   store float %v2, float* %out2, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 13 for VF 2 For instruction: store float %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 23 for VF 4 For instruction: store float %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 57 for VF 8 For instruction: store float %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 114 for VF 16 For instruction: store float %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 228 for VF 32 For instruction: store float %v2, ptr %out2, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v2, float* %out2, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   store float %v2, float* %out2, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction:   store float %v2, float* %out2, align 4
-; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction:   store float %v2, float* %out2, align 4
-; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction:   store float %v2, float* %out2, align 4
-; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction:   store float %v2, float* %out2, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 7 for VF 2 For instruction: store float %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 7 for VF 4 For instruction: store float %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 14 for VF 8 For instruction: store float %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 28 for VF 16 For instruction: store float %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 60 for VF 32 For instruction: store float %v2, ptr %out2, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v2, float* %out2, align 4
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   store float %v2, float* %out2, align 4
-; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction:   store float %v2, float* %out2, align 4
-; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction:   store float %v2, float* %out2, align 4
-; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction:   store float %v2, float* %out2, align 4
-; AVX512: LV: Found an estimated cost of 24 for VF 32 For instruction:   store float %v2, float* %out2, align 4
-; AVX512: LV: Found an estimated cost of 48 for VF 64 For instruction:   store float %v2, float* %out2, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: store float %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: store float %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 8 For instruction: store float %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: store float %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 24 for VF 32 For instruction: store float %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 48 for VF 64 For instruction: store float %v2, ptr %out2, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store float %v2, float* %out2, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,8 +54,8 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = uitofp i8 %v.narrow to float
 
@@ -62,13 +63,13 @@ for.body:
   %v1 = fadd float %v, 1.0
   %v2 = fadd float %v, 2.0
 
-  %out0 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.2
+  %out0 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.2
 
-  store float %v0, float* %out0
-  store float %v1, float* %out1
-  store float %v2, float* %out2
+  store float %v0, ptr %out0
+  store float %v1, ptr %out1
+  store float %v2, ptr %out2
 
   %iv.next = add nuw nsw i64 %iv.0, 3
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
index 4e970745d5a21..cf9cbaaf468c1 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-4.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store float %v3, ptr %out3"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x float] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v3, float* %out3, align 4
-; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   store float %v3, float* %out3, align 4
-; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction:   store float %v3, float* %out3, align 4
-; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction:   store float %v3, float* %out3, align 4
-; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction:   store float %v3, float* %out3, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4
+; SSE2:  LV: Found an estimated cost of 12 for VF 2 For instruction: store float %v3, ptr %out3, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 4 For instruction: store float %v3, ptr %out3, align 4
+; SSE2:  LV: Found an estimated cost of 56 for VF 8 For instruction: store float %v3, ptr %out3, align 4
+; SSE2:  LV: Found an estimated cost of 112 for VF 16 For instruction: store float %v3, ptr %out3, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v3, float* %out3, align 4
-; AVX1: LV: Found an estimated cost of 13 for VF 2 For instruction:   store float %v3, float* %out3, align 4
-; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction:   store float %v3, float* %out3, align 4
-; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction:   store float %v3, float* %out3, align 4
-; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction:   store float %v3, float* %out3, align 4
-; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction:   store float %v3, float* %out3, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 13 for VF 2 For instruction: store float %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 30 for VF 4 For instruction: store float %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 76 for VF 8 For instruction: store float %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 152 for VF 16 For instruction: store float %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 304 for VF 32 For instruction: store float %v3, ptr %out3, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v3, float* %out3, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store float %v3, float* %out3, align 4
-; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction:   store float %v3, float* %out3, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   store float %v3, float* %out3, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   store float %v3, float* %out3, align 4
-; AVX2: LV: Found an estimated cost of 80 for VF 32 For instruction:   store float %v3, float* %out3, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 6 for VF 2 For instruction: store float %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 8 for VF 4 For instruction: store float %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 20 for VF 8 For instruction: store float %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 40 for VF 16 For instruction: store float %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 80 for VF 32 For instruction: store float %v3, ptr %out3, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v3, float* %out3, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store float %v3, float* %out3, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction:   store float %v3, float* %out3, align 4
-; AVX512: LV: Found an estimated cost of 11 for VF 8 For instruction:   store float %v3, float* %out3, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction:   store float %v3, float* %out3, align 4
-; AVX512: LV: Found an estimated cost of 44 for VF 32 For instruction:   store float %v3, float* %out3, align 4
-; AVX512: LV: Found an estimated cost of 88 for VF 64 For instruction:   store float %v3, float* %out3, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store float %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 4 For instruction: store float %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 11 for VF 8 For instruction: store float %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 22 for VF 16 For instruction: store float %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 44 for VF 32 For instruction: store float %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 88 for VF 64 For instruction: store float %v3, ptr %out3, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store float %v3, float* %out3, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -54,8 +55,8 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = uitofp i8 %v.narrow to float
 
@@ -64,15 +65,15 @@ for.body:
   %v2 = fadd float %v, 2.0
   %v3 = fadd float %v, 3.0
 
-  %out0 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.3
+  %out0 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.3
 
-  store float %v0, float* %out0
-  store float %v1, float* %out1
-  store float %v2, float* %out2
-  store float %v3, float* %out3
+  store float %v0, ptr %out0
+  store float %v1, ptr %out1
+  store float %v2, ptr %out2
+  store float %v3, ptr %out3
 
   %iv.next = add nuw nsw i64 %iv.0, 4
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-5.ll
new file mode 100644
index 0000000000000..52e553f9562d8
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-5.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store float %v4, ptr %out4"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x float] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4
+; SSE2:  LV: Found an estimated cost of 20 for VF 2 For instruction: store float %v4, ptr %out4, align 4
+; SSE2:  LV: Found an estimated cost of 44 for VF 4 For instruction: store float %v4, ptr %out4, align 4
+; SSE2:  LV: Found an estimated cost of 88 for VF 8 For instruction: store float %v4, ptr %out4, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4
+; AVX1:  LV: Found an estimated cost of 17 for VF 2 For instruction: store float %v4, ptr %out4, align 4
+; AVX1:  LV: Found an estimated cost of 38 for VF 4 For instruction: store float %v4, ptr %out4, align 4
+; AVX1:  LV: Found an estimated cost of 95 for VF 8 For instruction: store float %v4, ptr %out4, align 4
+; AVX1:  LV: Found an estimated cost of 190 for VF 16 For instruction: store float %v4, ptr %out4, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4
+; AVX2:  LV: Found an estimated cost of 17 for VF 2 For instruction: store float %v4, ptr %out4, align 4
+; AVX2:  LV: Found an estimated cost of 38 for VF 4 For instruction: store float %v4, ptr %out4, align 4
+; AVX2:  LV: Found an estimated cost of 95 for VF 8 For instruction: store float %v4, ptr %out4, align 4
+; AVX2:  LV: Found an estimated cost of 190 for VF 16 For instruction: store float %v4, ptr %out4, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 7 for VF 2 For instruction: store float %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 14 for VF 4 For instruction: store float %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 21 for VF 8 For instruction: store float %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 35 for VF 16 For instruction: store float %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 70 for VF 32 For instruction: store float %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 140 for VF 64 For instruction: store float %v4, ptr %out4, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = uitofp i8 %v.narrow to float
+
+  %v0 = fadd float %v, 0.0
+  %v1 = fadd float %v, 1.0
+  %v2 = fadd float %v, 2.0
+  %v3 = fadd float %v, 3.0
+  %v4 = fadd float %v, 4.0
+
+  %out0 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.4
+
+  store float %v0, ptr %out0
+  store float %v1, ptr %out1
+  store float %v2, ptr %out2
+  store float %v3, ptr %out3
+  store float %v4, ptr %out4
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
index b515407823d90..9d692b7f4ac2f 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-6.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store float %v5, ptr %out5"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,36 +11,36 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x float] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v5, float* %out5, align 4
-; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction:   store float %v5, float* %out5, align 4
-; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction:   store float %v5, float* %out5, align 4
-; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction:   store float %v5, float* %out5, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4
+; SSE2:  LV: Found an estimated cost of 21 for VF 2 For instruction: store float %v5, ptr %out5, align 4
+; SSE2:  LV: Found an estimated cost of 48 for VF 4 For instruction: store float %v5, ptr %out5, align 4
+; SSE2:  LV: Found an estimated cost of 96 for VF 8 For instruction: store float %v5, ptr %out5, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v5, float* %out5, align 4
-; AVX1: LV: Found an estimated cost of 20 for VF 2 For instruction:   store float %v5, float* %out5, align 4
-; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction:   store float %v5, float* %out5, align 4
-; AVX1: LV: Found an estimated cost of 114 for VF 8 For instruction:   store float %v5, float* %out5, align 4
-; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction:   store float %v5, float* %out5, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4
+; AVX1:  LV: Found an estimated cost of 20 for VF 2 For instruction: store float %v5, ptr %out5, align 4
+; AVX1:  LV: Found an estimated cost of 45 for VF 4 For instruction: store float %v5, ptr %out5, align 4
+; AVX1:  LV: Found an estimated cost of 114 for VF 8 For instruction: store float %v5, ptr %out5, align 4
+; AVX1:  LV: Found an estimated cost of 228 for VF 16 For instruction: store float %v5, ptr %out5, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v5, float* %out5, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction:   store float %v5, float* %out5, align 4
-; AVX2: LV: Found an estimated cost of 15 for VF 4 For instruction:   store float %v5, float* %out5, align 4
-; AVX2: LV: Found an estimated cost of 39 for VF 8 For instruction:   store float %v5, float* %out5, align 4
-; AVX2: LV: Found an estimated cost of 78 for VF 16 For instruction:   store float %v5, float* %out5, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4
+; AVX2:  LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v5, ptr %out5, align 4
+; AVX2:  LV: Found an estimated cost of 15 for VF 4 For instruction: store float %v5, ptr %out5, align 4
+; AVX2:  LV: Found an estimated cost of 39 for VF 8 For instruction: store float %v5, ptr %out5, align 4
+; AVX2:  LV: Found an estimated cost of 78 for VF 16 For instruction: store float %v5, ptr %out5, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store float %v5, float* %out5, align 4
-; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction:   store float %v5, float* %out5, align 4
-; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction:   store float %v5, float* %out5, align 4
-; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction:   store float %v5, float* %out5, align 4
-; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction:   store float %v5, float* %out5, align 4
-; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction:   store float %v5, float* %out5, align 4
-; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction:   store float %v5, float* %out5, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 2 For instruction: store float %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 17 for VF 4 For instruction: store float %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 25 for VF 8 For instruction: store float %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 51 for VF 16 For instruction: store float %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 102 for VF 32 For instruction: store float %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 204 for VF 64 For instruction: store float %v5, ptr %out5, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store float %v5, float* %out5, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,8 +54,8 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = uitofp i8 %v.narrow to float
 
@@ -65,19 +66,19 @@ for.body:
   %v4 = fadd float %v, 4.0
   %v5 = fadd float %v, 5.0
 
-  %out0 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.3
-  %out4 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.4
-  %out5 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %iv.5
+  %out0 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.5
 
-  store float %v0, float* %out0
-  store float %v1, float* %out1
-  store float %v2, float* %out2
-  store float %v3, float* %out3
-  store float %v4, float* %out4
-  store float %v5, float* %out5
+  store float %v0, ptr %out0
+  store float %v1, ptr %out1
+  store float %v2, ptr %out2
+  store float %v3, ptr %out3
+  store float %v4, ptr %out4
+  store float %v5, ptr %out5
 
   %iv.next = add nuw nsw i64 %iv.0, 6
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-7.ll
new file mode 100644
index 0000000000000..291ee8781ac4f
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-7.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store float %v6, ptr %out6"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x float] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4
+; SSE2:  LV: Found an estimated cost of 23 for VF 2 For instruction: store float %v6, ptr %out6, align 4
+; SSE2:  LV: Found an estimated cost of 52 for VF 4 For instruction: store float %v6, ptr %out6, align 4
+; SSE2:  LV: Found an estimated cost of 104 for VF 8 For instruction: store float %v6, ptr %out6, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4
+; AVX1:  LV: Found an estimated cost of 26 for VF 2 For instruction: store float %v6, ptr %out6, align 4
+; AVX1:  LV: Found an estimated cost of 53 for VF 4 For instruction: store float %v6, ptr %out6, align 4
+; AVX1:  LV: Found an estimated cost of 133 for VF 8 For instruction: store float %v6, ptr %out6, align 4
+; AVX1:  LV: Found an estimated cost of 266 for VF 16 For instruction: store float %v6, ptr %out6, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4
+; AVX2:  LV: Found an estimated cost of 26 for VF 2 For instruction: store float %v6, ptr %out6, align 4
+; AVX2:  LV: Found an estimated cost of 53 for VF 4 For instruction: store float %v6, ptr %out6, align 4
+; AVX2:  LV: Found an estimated cost of 133 for VF 8 For instruction: store float %v6, ptr %out6, align 4
+; AVX2:  LV: Found an estimated cost of 266 for VF 16 For instruction: store float %v6, ptr %out6, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 10 for VF 2 For instruction: store float %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 20 for VF 4 For instruction: store float %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 8 For instruction: store float %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 70 for VF 16 For instruction: store float %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 140 for VF 32 For instruction: store float %v6, ptr %out6, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = uitofp i8 %v.narrow to float
+
+  %v0 = fadd float %v, 0.0
+  %v1 = fadd float %v, 1.0
+  %v2 = fadd float %v, 2.0
+  %v3 = fadd float %v, 3.0
+  %v4 = fadd float %v, 4.0
+  %v5 = fadd float %v, 5.0
+  %v6 = fadd float %v, 6.0
+
+  %out0 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.6
+
+  store float %v0, ptr %out0
+  store float %v1, ptr %out1
+  store float %v2, ptr %out2
+  store float %v3, ptr %out3
+  store float %v4, ptr %out4
+  store float %v5, ptr %out5
+  store float %v6, ptr %out6
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
new file mode 100644
index 0000000000000..702444afafc91
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store float %v7, ptr %out7"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x float] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; SSE2:  LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
+; SSE2:  LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
+; SSE2:  LV: Found an estimated cost of 112 for VF 8 For instruction: store float %v7, ptr %out7, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX1:  LV: Found an estimated cost of 26 for VF 2 For instruction: store float %v7, ptr %out7, align 4
+; AVX1:  LV: Found an estimated cost of 60 for VF 4 For instruction: store float %v7, ptr %out7, align 4
+; AVX1:  LV: Found an estimated cost of 152 for VF 8 For instruction: store float %v7, ptr %out7, align 4
+; AVX1:  LV: Found an estimated cost of 304 for VF 16 For instruction: store float %v7, ptr %out7, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX2:  LV: Found an estimated cost of 26 for VF 2 For instruction: store float %v7, ptr %out7, align 4
+; AVX2:  LV: Found an estimated cost of 60 for VF 4 For instruction: store float %v7, ptr %out7, align 4
+; AVX2:  LV: Found an estimated cost of 152 for VF 8 For instruction: store float %v7, ptr %out7, align 4
+; AVX2:  LV: Found an estimated cost of 304 for VF 16 For instruction: store float %v7, ptr %out7, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 23 for VF 4 For instruction: store float %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 46 for VF 8 For instruction: store float %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 92 for VF 16 For instruction: store float %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 184 for VF 32 For instruction: store float %v7, ptr %out7, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = uitofp i8 %v.narrow to float
+
+  %v0 = fadd float %v, 0.0
+  %v1 = fadd float %v, 1.0
+  %v2 = fadd float %v, 2.0
+  %v3 = fadd float %v, 3.0
+  %v4 = fadd float %v, 4.0
+  %v5 = fadd float %v, 5.0
+  %v6 = fadd float %v, 6.0
+  %v7 = fadd float %v, 7.0
+
+  %out0 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.6
+  %out7 = getelementptr inbounds [1024 x float], ptr @B, i64 0, i64 %iv.7
+
+  store float %v0, ptr %out0
+  store float %v1, ptr %out1
+  store float %v2, ptr %out2
+  store float %v3, ptr %out3
+  store float %v4, ptr %out4
+  store float %v5, ptr %out5
+  store float %v6, ptr %out6
+  store float %v7, ptr %out7
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
index 88ab8a60531c1..73505eca28610 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-2.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v1, ptr %out1"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x double] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v1, double* %out1, align 8
-; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store double %v1, double* %out1, align 8
-; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction:   store double %v1, double* %out1, align 8
-; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction:   store double %v1, double* %out1, align 8
-; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction:   store double %v1, double* %out1, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; SSE2:  LV: Found an estimated cost of 6 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; SSE2:  LV: Found an estimated cost of 48 for VF 16 For instruction: store double %v1, ptr %out1, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v1, double* %out1, align 8
-; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction:   store double %v1, double* %out1, align 8
-; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction:   store double %v1, double* %out1, align 8
-; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction:   store double %v1, double* %out1, align 8
-; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction:   store double %v1, double* %out1, align 8
-; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction:   store double %v1, double* %out1, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 6 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 16 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 32 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 64 for VF 16 For instruction: store double %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 128 for VF 32 For instruction: store double %v1, ptr %out1, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v1, double* %out1, align 8
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   store double %v1, double* %out1, align 8
-; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   store double %v1, double* %out1, align 8
-; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   store double %v1, double* %out1, align 8
-; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction:   store double %v1, double* %out1, align 8
-; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction:   store double %v1, double* %out1, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 6 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 12 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 24 for VF 16 For instruction: store double %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 48 for VF 32 For instruction: store double %v1, ptr %out1, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v1, double* %out1, align 8
-; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction:   store double %v1, double* %out1, align 8
-; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction:   store double %v1, double* %out1, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction:   store double %v1, double* %out1, align 8
-; AVX512: LV: Found an estimated cost of 10 for VF 16 For instruction:   store double %v1, double* %out1, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 32 For instruction:   store double %v1, double* %out1, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 64 For instruction:   store double %v1, double* %out1, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 2 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 2 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 5 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 10 for VF 16 For instruction: store double %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 20 for VF 32 For instruction: store double %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 64 For instruction: store double %v1, ptr %out1, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store double %v1, double* %out1, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,19 +53,19 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = uitofp i8 %v.narrow to double
 
   %v0 = fadd double %v, 0.0
   %v1 = fadd double %v, 1.0
 
-  %out0 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.1
+  %out0 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.1
 
-  store double %v0, double* %out0
-  store double %v1, double* %out1
+  store double %v0, ptr %out0
+  store double %v1, ptr %out1
 
   %iv.next = add nuw nsw i64 %iv.0, 2
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll
index d4faabbb65302..88b425b5ddc52 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-3.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v2, ptr %out2"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,36 +11,36 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x double] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v2, double* %out2, align 8
-; SSE2: LV: Found an estimated cost of 10 for VF 2 For instruction:   store double %v2, double* %out2, align 8
-; SSE2: LV: Found an estimated cost of 20 for VF 4 For instruction:   store double %v2, double* %out2, align 8
-; SSE2: LV: Found an estimated cost of 40 for VF 8 For instruction:   store double %v2, double* %out2, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; SSE2:  LV: Found an estimated cost of 10 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 8 For instruction: store double %v2, ptr %out2, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v2, double* %out2, align 8
-; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction:   store double %v2, double* %out2, align 8
-; AVX1: LV: Found an estimated cost of 27 for VF 4 For instruction:   store double %v2, double* %out2, align 8
-; AVX1: LV: Found an estimated cost of 54 for VF 8 For instruction:   store double %v2, double* %out2, align 8
-; AVX1: LV: Found an estimated cost of 108 for VF 16 For instruction:   store double %v2, double* %out2, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX1:  LV: Found an estimated cost of 11 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; AVX1:  LV: Found an estimated cost of 27 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; AVX1:  LV: Found an estimated cost of 54 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX1:  LV: Found an estimated cost of 108 for VF 16 For instruction: store double %v2, ptr %out2, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v2, double* %out2, align 8
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store double %v2, double* %out2, align 8
-; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction:   store double %v2, double* %out2, align 8
-; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction:   store double %v2, double* %out2, align 8
-; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction:   store double %v2, double* %out2, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX2:  LV: Found an estimated cost of 6 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; AVX2:  LV: Found an estimated cost of 9 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; AVX2:  LV: Found an estimated cost of 18 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX2:  LV: Found an estimated cost of 36 for VF 16 For instruction: store double %v2, ptr %out2, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v2, double* %out2, align 8
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   store double %v2, double* %out2, align 8
-; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction:   store double %v2, double* %out2, align 8
-; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction:   store double %v2, double* %out2, align 8
-; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction:   store double %v2, double* %out2, align 8
-; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction:   store double %v2, double* %out2, align 8
-; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction:   store double %v2, double* %out2, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 8 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 24 for VF 16 For instruction: store double %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 48 for VF 32 For instruction: store double %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 96 for VF 64 For instruction: store double %v2, ptr %out2, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store double %v2, double* %out2, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -50,8 +51,8 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = uitofp i8 %v.narrow to double
 
@@ -59,13 +60,13 @@ for.body:
   %v1 = fadd double %v, 1.0
   %v2 = fadd double %v, 2.0
 
-  %out0 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.2
+  %out0 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.2
 
-  store double %v0, double* %out0
-  store double %v1, double* %out1
-  store double %v2, double* %out2
+  store double %v0, ptr %out0
+  store double %v1, ptr %out1
+  store double %v2, ptr %out2
 
   %iv.next = add nuw nsw i64 %iv.0, 3
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
index 1e7b7fcb74b59..dec47f2bc3c17 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-4.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v3, ptr %out3"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,35 +11,35 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x double] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
-; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction:   store double %v3, double* %out3, align 8
-; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction:   store double %v3, double* %out3, align 8
-; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction:   store double %v3, double* %out3, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; SSE2:  LV: Found an estimated cost of 12 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; SSE2:  LV: Found an estimated cost of 48 for VF 8 For instruction: store double %v3, ptr %out3, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
-; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction:   store double %v3, double* %out3, align 8
-; AVX1: LV: Found an estimated cost of 32 for VF 4 For instruction:   store double %v3, double* %out3, align 8
-; AVX1: LV: Found an estimated cost of 64 for VF 8 For instruction:   store double %v3, double* %out3, align 8
-; AVX1: LV: Found an estimated cost of 128 for VF 16 For instruction:   store double %v3, double* %out3, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX1:  LV: Found an estimated cost of 12 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; AVX1:  LV: Found an estimated cost of 32 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; AVX1:  LV: Found an estimated cost of 64 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX1:  LV: Found an estimated cost of 128 for VF 16 For instruction: store double %v3, ptr %out3, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
-; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   store double %v3, double* %out3, align 8
-; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   store double %v3, double* %out3, align 8
-; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction:   store double %v3, double* %out3, align 8
-; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction:   store double %v3, double* %out3, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX2:  LV: Found an estimated cost of 8 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; AVX2:  LV: Found an estimated cost of 12 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; AVX2:  LV: Found an estimated cost of 28 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX2:  LV: Found an estimated cost of 56 for VF 16 For instruction: store double %v3, ptr %out3, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v3, double* %out3, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store double %v3, double* %out3, align 8
-; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   store double %v3, double* %out3, align 8
-; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction:   store double %v3, double* %out3, align 8
-; AVX512: LV: Found an estimated cost of 44 for VF 16 For instruction:   store double %v3, double* %out3, align 8
-; AVX512: LV: Found an estimated cost of 88 for VF 32 For instruction:   store double %v3, double* %out3, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 11 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 22 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 44 for VF 16 For instruction: store double %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 88 for VF 32 For instruction: store double %v3, ptr %out3, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store double %v3, double* %out3, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -50,8 +51,8 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = uitofp i8 %v.narrow to double
 
@@ -60,15 +61,15 @@ for.body:
   %v2 = fadd double %v, 2.0
   %v3 = fadd double %v, 3.0
 
-  %out0 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.3
+  %out0 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.3
 
-  store double %v0, double* %out0
-  store double %v1, double* %out1
-  store double %v2, double* %out2
-  store double %v3, double* %out3
+  store double %v0, ptr %out0
+  store double %v1, ptr %out1
+  store double %v2, ptr %out2
+  store double %v3, ptr %out3
 
   %iv.next = add nuw nsw i64 %iv.0, 4
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-5.ll
new file mode 100644
index 0000000000000..e75502aed80d2
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-5.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v4, ptr %out4"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x double] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; SSE2:  LV: Found an estimated cost of 18 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; SSE2:  LV: Found an estimated cost of 36 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX1:  LV: Found an estimated cost of 20 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; AVX1:  LV: Found an estimated cost of 49 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; AVX1:  LV: Found an estimated cost of 98 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX2:  LV: Found an estimated cost of 20 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; AVX2:  LV: Found an estimated cost of 49 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; AVX2:  LV: Found an estimated cost of 98 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 14 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 21 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 35 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 70 for VF 16 For instruction: store double %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 140 for VF 32 For instruction: store double %v4, ptr %out4, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = uitofp i8 %v.narrow to double
+
+  %v0 = fadd double %v, 0.0
+  %v1 = fadd double %v, 1.0
+  %v2 = fadd double %v, 2.0
+  %v3 = fadd double %v, 3.0
+  %v4 = fadd double %v, 4.0
+
+  %out0 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.4
+
+  store double %v0, ptr %out0
+  store double %v1, ptr %out1
+  store double %v2, ptr %out2
+  store double %v3, ptr %out3
+  store double %v4, ptr %out4
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll
index 2815908e1cde7..b5fefd7cdd7f6 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-6.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v5, ptr %out5"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,32 +11,32 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x double] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v5, double* %out5, align 8
-; SSE2: LV: Found an estimated cost of 20 for VF 2 For instruction:   store double %v5, double* %out5, align 8
-; SSE2: LV: Found an estimated cost of 40 for VF 4 For instruction:   store double %v5, double* %out5, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; SSE2:  LV: Found an estimated cost of 20 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; SSE2:  LV: Found an estimated cost of 40 for VF 4 For instruction: store double %v5, ptr %out5, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v5, double* %out5, align 8
-; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction:   store double %v5, double* %out5, align 8
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   store double %v5, double* %out5, align 8
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction:   store double %v5, double* %out5, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX1:  LV: Found an estimated cost of 21 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; AVX1:  LV: Found an estimated cost of 54 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; AVX1:  LV: Found an estimated cost of 108 for VF 8 For instruction: store double %v5, ptr %out5, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v5, double* %out5, align 8
-; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction:   store double %v5, double* %out5, align 8
-; AVX2: LV: Found an estimated cost of 21 for VF 4 For instruction:   store double %v5, double* %out5, align 8
-; AVX2: LV: Found an estimated cost of 42 for VF 8 For instruction:   store double %v5, double* %out5, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX2:  LV: Found an estimated cost of 11 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; AVX2:  LV: Found an estimated cost of 21 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; AVX2:  LV: Found an estimated cost of 42 for VF 8 For instruction: store double %v5, ptr %out5, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store double %v5, double* %out5, align 8
-; AVX512: LV: Found an estimated cost of 17 for VF 2 For instruction:   store double %v5, double* %out5, align 8
-; AVX512: LV: Found an estimated cost of 25 for VF 4 For instruction:   store double %v5, double* %out5, align 8
-; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction:   store double %v5, double* %out5, align 8
-; AVX512: LV: Found an estimated cost of 102 for VF 16 For instruction:   store double %v5, double* %out5, align 8
-; AVX512: LV: Found an estimated cost of 204 for VF 32 For instruction:   store double %v5, double* %out5, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 17 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 25 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 51 for VF 8 For instruction: store double %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 102 for VF 16 For instruction: store double %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 204 for VF 32 For instruction: store double %v5, ptr %out5, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store double %v5, double* %out5, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -49,8 +50,8 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = uitofp i8 %v.narrow to double
 
@@ -61,19 +62,19 @@ for.body:
   %v4 = fadd double %v, 4.0
   %v5 = fadd double %v, 5.0
 
-  %out0 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.3
-  %out4 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.4
-  %out5 = getelementptr inbounds [1024 x double], [1024 x double]* @B, i64 0, i64 %iv.5
+  %out0 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.5
 
-  store double %v0, double* %out0
-  store double %v1, double* %out1
-  store double %v2, double* %out2
-  store double %v3, double* %out3
-  store double %v4, double* %out4
-  store double %v5, double* %out5
+  store double %v0, ptr %out0
+  store double %v1, ptr %out1
+  store double %v2, ptr %out2
+  store double %v3, ptr %out3
+  store double %v4, ptr %out4
+  store double %v5, ptr %out5
 
   %iv.next = add nuw nsw i64 %iv.0, 6
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-7.ll
new file mode 100644
index 0000000000000..13326f082833c
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-7.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v6, ptr %out6"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x double] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; SSE2:  LV: Found an estimated cost of 22 for VF 2 For instruction: store double %v6, ptr %out6, align 8
+; SSE2:  LV: Found an estimated cost of 44 for VF 4 For instruction: store double %v6, ptr %out6, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX1:  LV: Found an estimated cost of 23 for VF 2 For instruction: store double %v6, ptr %out6, align 8
+; AVX1:  LV: Found an estimated cost of 59 for VF 4 For instruction: store double %v6, ptr %out6, align 8
+; AVX1:  LV: Found an estimated cost of 118 for VF 8 For instruction: store double %v6, ptr %out6, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX2:  LV: Found an estimated cost of 23 for VF 2 For instruction: store double %v6, ptr %out6, align 8
+; AVX2:  LV: Found an estimated cost of 59 for VF 4 For instruction: store double %v6, ptr %out6, align 8
+; AVX2:  LV: Found an estimated cost of 118 for VF 8 For instruction: store double %v6, ptr %out6, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 20 for VF 2 For instruction: store double %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 4 For instruction: store double %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 70 for VF 8 For instruction: store double %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 140 for VF 16 For instruction: store double %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 280 for VF 32 For instruction: store double %v6, ptr %out6, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = uitofp i8 %v.narrow to double
+
+  %v0 = fadd double %v, 0.0
+  %v1 = fadd double %v, 1.0
+  %v2 = fadd double %v, 2.0
+  %v3 = fadd double %v, 3.0
+  %v4 = fadd double %v, 4.0
+  %v5 = fadd double %v, 5.0
+  %v6 = fadd double %v, 6.0
+
+  %out0 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.6
+
+  store double %v0, ptr %out0
+  store double %v1, ptr %out1
+  store double %v2, ptr %out2
+  store double %v3, ptr %out3
+  store double %v4, ptr %out4
+  store double %v5, ptr %out5
+  store double %v6, ptr %out6
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
new file mode 100644
index 0000000000000..dc91cd3149e77
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v7, ptr %out7"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x double] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; SSE2:  LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; SSE2:  LV: Found an estimated cost of 48 for VF 4 For instruction: store double %v7, ptr %out7, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX1:  LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; AVX1:  LV: Found an estimated cost of 64 for VF 4 For instruction: store double %v7, ptr %out7, align 8
+; AVX1:  LV: Found an estimated cost of 128 for VF 8 For instruction: store double %v7, ptr %out7, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX2:  LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; AVX2:  LV: Found an estimated cost of 64 for VF 4 For instruction: store double %v7, ptr %out7, align 8
+; AVX2:  LV: Found an estimated cost of 128 for VF 8 For instruction: store double %v7, ptr %out7, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX512:  LV: Found an estimated cost of 23 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; AVX512:  LV: Found an estimated cost of 46 for VF 4 For instruction: store double %v7, ptr %out7, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 8 For instruction: store double %v7, ptr %out7, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = uitofp i8 %v.narrow to double
+
+  %v0 = fadd double %v, 0.0
+  %v1 = fadd double %v, 1.0
+  %v2 = fadd double %v, 2.0
+  %v3 = fadd double %v, 3.0
+  %v4 = fadd double %v, 4.0
+  %v5 = fadd double %v, 5.0
+  %v6 = fadd double %v, 6.0
+  %v7 = fadd double %v, 7.0
+
+  %out0 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.6
+  %out7 = getelementptr inbounds [1024 x double], ptr @B, i64 0, i64 %iv.7
+
+  store double %v0, ptr %out0
+  store double %v1, ptr %out1
+  store double %v2, ptr %out2
+  store double %v3, ptr %out3
+  store double %v4, ptr %out4
+  store double %v5, ptr %out5
+  store double %v6, ptr %out6
+  store double %v7, ptr %out7
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
index f672150ad4f86..24fecd1ed25d2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %v1, ptr %out1"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2
+; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2
+; SSE2:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2
+; SSE2:  LV: Found an estimated cost of 34 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2
+; SSE2:  LV: Found an estimated cost of 68 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2
-; SSE2: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i16 %v1, i16* %out1, align 2
-; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
-; SSE2: LV: Found an estimated cost of 68 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX1:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX1:  LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX1:  LV: Found an estimated cost of 86 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX1:  LV: Found an estimated cost of 172 for VF 32 For instruction: store i16 %v1, ptr %out1, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 35 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX2:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX2:  LV: Found an estimated cost of 4 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX2:  LV: Found an estimated cost of 6 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX2:  LV: Found an estimated cost of 12 for VF 32 For instruction: store i16 %v1, ptr %out1, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX2: LV: Found an estimated cost of 4 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX2: LV: Found an estimated cost of 6 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX2: LV: Found an estimated cost of 12 for VF 32 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512DQ:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512DQ:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512DQ:  LV: Found an estimated cost of 4 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512DQ:  LV: Found an estimated cost of 5 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512DQ:  LV: Found an estimated cost of 10 for VF 32 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512DQ:  LV: Found an estimated cost of 372 for VF 64 For instruction: store i16 %v1, ptr %out1, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX512: LV: Found an estimated cost of 3 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX512: LV: Found an estimated cost of 3 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX512: LV: Found an estimated cost of 7 for VF 32 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX512: LV: Found an estimated cost of 14 for VF 64 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512BW:  LV: Found an estimated cost of 3 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512BW:  LV: Found an estimated cost of 7 for VF 32 For instruction: store i16 %v1, ptr %out1, align 2
+; AVX512BW:  LV: Found an estimated cost of 14 for VF 64 For instruction: store i16 %v1, ptr %out1, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v1, i16* %out1, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,19 +63,19 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i16
 
   %v0 = add i16 %v, 0
   %v1 = add i16 %v, 1
 
-  %out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1
+  %out0 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.1
 
-  store i16 %v0, i16* %out0
-  store i16 %v1, i16* %out1
+  store i16 %v0, ptr %out0
+  store i16 %v1, ptr %out1
 
   %iv.next = add nuw nsw i64 %iv.0, 2
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
index c4834da5ecbc9..ceca668e789dc 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %v2, ptr %out2"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2
+; SSE2:  LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2
+; SSE2:  LV: Found an estimated cost of 26 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2
+; SSE2:  LV: Found an estimated cost of 51 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2
+; SSE2:  LV: Found an estimated cost of 102 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
-; SSE2: LV: Found an estimated cost of 16 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
-; SSE2: LV: Found an estimated cost of 26 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
-; SSE2: LV: Found an estimated cost of 51 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
-; SSE2: LV: Found an estimated cost of 102 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX1:  LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX1:  LV: Found an estimated cost of 30 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX1:  LV: Found an estimated cost of 53 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX1:  LV: Found an estimated cost of 129 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX1:  LV: Found an estimated cost of 258 for VF 32 For instruction: store i16 %v2, ptr %out2, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 53 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX2:  LV: Found an estimated cost of 7 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX2:  LV: Found an estimated cost of 9 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX2:  LV: Found an estimated cost of 14 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX2:  LV: Found an estimated cost of 30 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX2:  LV: Found an estimated cost of 60 for VF 32 For instruction: store i16 %v2, ptr %out2, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512DQ:  LV: Found an estimated cost of 7 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512DQ:  LV: Found an estimated cost of 9 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512DQ:  LV: Found an estimated cost of 15 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512DQ:  LV: Found an estimated cost of 29 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512DQ:  LV: Found an estimated cost of 57 for VF 32 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512DQ:  LV: Found an estimated cost of 558 for VF 64 For instruction: store i16 %v2, ptr %out2, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX512: LV: Found an estimated cost of 6 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX512: LV: Found an estimated cost of 18 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX512: LV: Found an estimated cost of 36 for VF 64 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512BW:  LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512BW:  LV: Found an estimated cost of 6 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512BW:  LV: Found an estimated cost of 6 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512BW:  LV: Found an estimated cost of 12 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512BW:  LV: Found an estimated cost of 18 for VF 32 For instruction: store i16 %v2, ptr %out2, align 2
+; AVX512BW:  LV: Found an estimated cost of 36 for VF 64 For instruction: store i16 %v2, ptr %out2, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v2, i16* %out2, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,8 +64,8 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i16
 
@@ -62,13 +73,13 @@ for.body:
   %v1 = add i16 %v, 1
   %v2 = add i16 %v, 2
 
-  %out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.2
+  %out0 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.2
 
-  store i16 %v0, i16* %out0
-  store i16 %v1, i16* %out1
-  store i16 %v2, i16* %out2
+  store i16 %v0, ptr %out0
+  store i16 %v1, ptr %out1
+  store i16 %v2, ptr %out2
 
   %iv.next = add nuw nsw i64 %iv.0, 3
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
index 25f73b1302150..9527912ab239c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %v3, ptr %out3"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2
+; SSE2:  LV: Found an estimated cost of 17 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2
+; SSE2:  LV: Found an estimated cost of 34 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2
+; SSE2:  LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2
+; SSE2:  LV: Found an estimated cost of 136 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
-; SSE2: LV: Found an estimated cost of 17 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2
-; SSE2: LV: Found an estimated cost of 34 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
-; SSE2: LV: Found an estimated cost of 68 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
-; SSE2: LV: Found an estimated cost of 136 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX1:  LV: Found an estimated cost of 17 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX1:  LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX1:  LV: Found an estimated cost of 70 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX1:  LV: Found an estimated cost of 172 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX1:  LV: Found an estimated cost of 344 for VF 32 For instruction: store i16 %v3, ptr %out3, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX2:  LV: Found an estimated cost of 7 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX2:  LV: Found an estimated cost of 12 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX2:  LV: Found an estimated cost of 36 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX2:  LV: Found an estimated cost of 72 for VF 32 For instruction: store i16 %v3, ptr %out3, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512DQ:  LV: Found an estimated cost of 3 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512DQ:  LV: Found an estimated cost of 7 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512DQ:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512DQ:  LV: Found an estimated cost of 34 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512DQ:  LV: Found an estimated cost of 68 for VF 32 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512DQ:  LV: Found an estimated cost of 744 for VF 64 For instruction: store i16 %v3, ptr %out3, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX512: LV: Found an estimated cost of 17 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX512: LV: Found an estimated cost of 34 for VF 32 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX512: LV: Found an estimated cost of 68 for VF 64 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512BW:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512BW:  LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512BW:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512BW:  LV: Found an estimated cost of 17 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512BW:  LV: Found an estimated cost of 34 for VF 32 For instruction: store i16 %v3, ptr %out3, align 2
+; AVX512BW:  LV: Found an estimated cost of 68 for VF 64 For instruction: store i16 %v3, ptr %out3, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v3, i16* %out3, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -54,8 +65,8 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i16
 
@@ -64,15 +75,15 @@ for.body:
   %v2 = add i16 %v, 2
   %v3 = add i16 %v, 3
 
-  %out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.3
+  %out0 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.3
 
-  store i16 %v0, i16* %out0
-  store i16 %v1, i16* %out1
-  store i16 %v2, i16* %out2
-  store i16 %v3, i16* %out3
+  store i16 %v0, ptr %out0
+  store i16 %v1, ptr %out1
+  store i16 %v2, ptr %out2
+  store i16 %v3, ptr %out3
 
   %iv.next = add nuw nsw i64 %iv.0, 4
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
index 80f3ac8bcd0be..d6f300e3c5eb0 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %v4, ptr %out4"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2
+; SSE2:  LV: Found an estimated cost of 22 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2
+; SSE2:  LV: Found an estimated cost of 43 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2
+; SSE2:  LV: Found an estimated cost of 85 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2
+; SSE2:  LV: Found an estimated cost of 170 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
-; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
-; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
-; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
-; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX1:  LV: Found an estimated cost of 27 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX1:  LV: Found an estimated cost of 45 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX1:  LV: Found an estimated cost of 88 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX1:  LV: Found an estimated cost of 215 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX1:  LV: Found an estimated cost of 430 for VF 32 For instruction: store i16 %v4, ptr %out4, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 430 for VF 32 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX2:  LV: Found an estimated cost of 27 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX2:  LV: Found an estimated cost of 45 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX2:  LV: Found an estimated cost of 88 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX2:  LV: Found an estimated cost of 215 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX2:  LV: Found an estimated cost of 430 for VF 32 For instruction: store i16 %v4, ptr %out4, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 27 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 88 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 430 for VF 32 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512DQ:  LV: Found an estimated cost of 27 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512DQ:  LV: Found an estimated cost of 47 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512DQ:  LV: Found an estimated cost of 87 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512DQ:  LV: Found an estimated cost of 213 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512DQ:  LV: Found an estimated cost of 465 for VF 32 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512DQ:  LV: Found an estimated cost of 930 for VF 64 For instruction: store i16 %v4, ptr %out4, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX512: LV: Found an estimated cost of 33 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX512: LV: Found an estimated cost of 55 for VF 32 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX512: LV: Found an estimated cost of 110 for VF 64 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512BW:  LV: Found an estimated cost of 11 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512BW:  LV: Found an estimated cost of 11 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512BW:  LV: Found an estimated cost of 22 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512BW:  LV: Found an estimated cost of 33 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512BW:  LV: Found an estimated cost of 55 for VF 32 For instruction: store i16 %v4, ptr %out4, align 2
+; AVX512BW:  LV: Found an estimated cost of 110 for VF 64 For instruction: store i16 %v4, ptr %out4, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v4, i16* %out4, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -55,8 +66,8 @@ for.body:
   %iv.3 = add nuw nsw i64 %iv, 3
   %iv.4 = add nuw nsw i64 %iv, 4
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i16
 
@@ -66,17 +77,17 @@ for.body:
   %v3 = add i16 %v, 3
   %v4 = add i16 %v, 4
 
-  %out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.3
-  %out4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.4
+  %out0 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.4
 
-  store i16 %v0, i16* %out0
-  store i16 %v1, i16* %out1
-  store i16 %v2, i16* %out2
-  store i16 %v3, i16* %out3
-  store i16 %v4, i16* %out4
+  store i16 %v0, ptr %out0
+  store i16 %v1, ptr %out1
+  store i16 %v2, ptr %out2
+  store i16 %v3, ptr %out3
+  store i16 %v4, ptr %out4
 
   %iv.next = add nuw nsw i64 %iv.0, 5
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
index 0ee917f80f006..639bc8a3d2587 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %v5, ptr %out5"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i16] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2
+; SSE2:  LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2
+; SSE2:  LV: Found an estimated cost of 51 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2
+; SSE2:  LV: Found an estimated cost of 102 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2
+; SSE2:  LV: Found an estimated cost of 204 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
-; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
-; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
-; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
-; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX1:  LV: Found an estimated cost of 30 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX1:  LV: Found an estimated cost of 53 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX1:  LV: Found an estimated cost of 105 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX1:  LV: Found an estimated cost of 258 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX1:  LV: Found an estimated cost of 516 for VF 32 For instruction: store i16 %v5, ptr %out5, align 2
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 53 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 516 for VF 32 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX2:  LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX2:  LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX2:  LV: Found an estimated cost of 24 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX2:  LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX2:  LV: Found an estimated cost of 102 for VF 32 For instruction: store i16 %v5, ptr %out5, align 2
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX2: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX2: LV: Found an estimated cost of 24 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX2: LV: Found an estimated cost of 64 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX2: LV: Found an estimated cost of 102 for VF 32 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512DQ:  LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512DQ:  LV: Found an estimated cost of 18 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512DQ:  LV: Found an estimated cost of 23 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512DQ:  LV: Found an estimated cost of 61 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512DQ:  LV: Found an estimated cost of 96 for VF 32 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512DQ:  LV: Found an estimated cost of 1116 for VF 64 For instruction: store i16 %v5, ptr %out5, align 2
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX512: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX512: LV: Found an estimated cost of 13 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX512: LV: Found an estimated cost of 27 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX512: LV: Found an estimated cost of 40 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX512: LV: Found an estimated cost of 81 for VF 32 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX512: LV: Found an estimated cost of 162 for VF 64 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512BW:  LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512BW:  LV: Found an estimated cost of 13 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512BW:  LV: Found an estimated cost of 27 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512BW:  LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512BW:  LV: Found an estimated cost of 81 for VF 32 For instruction: store i16 %v5, ptr %out5, align 2
+; AVX512BW:  LV: Found an estimated cost of 162 for VF 64 For instruction: store i16 %v5, ptr %out5, align 2
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i16 %v5, i16* %out5, align 2
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -56,8 +67,8 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i16
 
@@ -68,19 +79,19 @@ for.body:
   %v4 = add i16 %v, 4
   %v5 = add i16 %v, 5
 
-  %out0 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.3
-  %out4 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.4
-  %out5 = getelementptr inbounds [1024 x i16], [1024 x i16]* @B, i64 0, i64 %iv.5
+  %out0 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.5
 
-  store i16 %v0, i16* %out0
-  store i16 %v1, i16* %out1
-  store i16 %v2, i16* %out2
-  store i16 %v3, i16* %out3
-  store i16 %v4, i16* %out4
-  store i16 %v5, i16* %out5
+  store i16 %v0, ptr %out0
+  store i16 %v1, ptr %out1
+  store i16 %v2, ptr %out2
+  store i16 %v3, ptr %out3
+  store i16 %v4, ptr %out4
+  store i16 %v5, ptr %out5
 
   %iv.next = add nuw nsw i64 %iv.0, 6
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-7.ll
new file mode 100644
index 0000000000000..8e4f8730c77e1
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-7.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %v6, ptr %out6"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i16] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2
+; SSE2:  LV: Found an estimated cost of 33 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2
+; SSE2:  LV: Found an estimated cost of 60 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2
+; SSE2:  LV: Found an estimated cost of 119 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2
+; SSE2:  LV: Found an estimated cost of 238 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX1:  LV: Found an estimated cost of 36 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX1:  LV: Found an estimated cost of 65 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX1:  LV: Found an estimated cost of 123 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX1:  LV: Found an estimated cost of 301 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX1:  LV: Found an estimated cost of 602 for VF 32 For instruction: store i16 %v6, ptr %out6, align 2
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX2:  LV: Found an estimated cost of 36 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX2:  LV: Found an estimated cost of 65 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX2:  LV: Found an estimated cost of 123 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX2:  LV: Found an estimated cost of 301 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX2:  LV: Found an estimated cost of 602 for VF 32 For instruction: store i16 %v6, ptr %out6, align 2
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512DQ:  LV: Found an estimated cost of 36 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512DQ:  LV: Found an estimated cost of 66 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512DQ:  LV: Found an estimated cost of 123 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512DQ:  LV: Found an estimated cost of 298 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512DQ:  LV: Found an estimated cost of 651 for VF 32 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512DQ:  LV: Found an estimated cost of 1302 for VF 64 For instruction: store i16 %v6, ptr %out6, align 2
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512BW:  LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512BW:  LV: Found an estimated cost of 16 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512BW:  LV: Found an estimated cost of 32 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512BW:  LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512BW:  LV: Found an estimated cost of 112 for VF 32 For instruction: store i16 %v6, ptr %out6, align 2
+; AVX512BW:  LV: Found an estimated cost of 224 for VF 64 For instruction: store i16 %v6, ptr %out6, align 2
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = zext i8 %v.narrow to i16
+
+  %v0 = add i16 %v, 0
+  %v1 = add i16 %v, 1
+  %v2 = add i16 %v, 2
+  %v3 = add i16 %v, 3
+  %v4 = add i16 %v, 4
+  %v5 = add i16 %v, 5
+  %v6 = add i16 %v, 6
+
+  %out0 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.6
+
+  store i16 %v0, ptr %out0
+  store i16 %v1, ptr %out1
+  store i16 %v2, ptr %out2
+  store i16 %v3, ptr %out3
+  store i16 %v4, ptr %out4
+  store i16 %v5, ptr %out5
+  store i16 %v6, ptr %out6
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-8.ll
new file mode 100644
index 0000000000000..194ec83e3f056
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-8.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i16 %v7, ptr %out7"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i16] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2
+; SSE2:  LV: Found an estimated cost of 34 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2
+; SSE2:  LV: Found an estimated cost of 68 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2
+; SSE2:  LV: Found an estimated cost of 136 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2
+; SSE2:  LV: Found an estimated cost of 272 for VF 16 For instruction: store i16 %v7, ptr %out7, align 2
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX1:  LV: Found an estimated cost of 35 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX1:  LV: Found an estimated cost of 70 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX1:  LV: Found an estimated cost of 140 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX1:  LV: Found an estimated cost of 344 for VF 16 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX1:  LV: Found an estimated cost of 688 for VF 32 For instruction: store i16 %v7, ptr %out7, align 2
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX2:  LV: Found an estimated cost of 35 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX2:  LV: Found an estimated cost of 70 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX2:  LV: Found an estimated cost of 140 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX2:  LV: Found an estimated cost of 344 for VF 16 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX2:  LV: Found an estimated cost of 688 for VF 32 For instruction: store i16 %v7, ptr %out7, align 2
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512DQ:  LV: Found an estimated cost of 35 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512DQ:  LV: Found an estimated cost of 69 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512DQ:  LV: Found an estimated cost of 138 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512DQ:  LV: Found an estimated cost of 340 for VF 16 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512DQ:  LV: Found an estimated cost of 744 for VF 32 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512DQ:  LV: Found an estimated cost of 1488 for VF 64 For instruction: store i16 %v7, ptr %out7, align 2
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512BW:  LV: Found an estimated cost of 18 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512BW:  LV: Found an estimated cost of 18 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512BW:  LV: Found an estimated cost of 37 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512BW:  LV: Found an estimated cost of 74 for VF 16 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512BW:  LV: Found an estimated cost of 148 for VF 32 For instruction: store i16 %v7, ptr %out7, align 2
+; AVX512BW:  LV: Found an estimated cost of 296 for VF 64 For instruction: store i16 %v7, ptr %out7, align 2
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = zext i8 %v.narrow to i16
+
+  %v0 = add i16 %v, 0
+  %v1 = add i16 %v, 1
+  %v2 = add i16 %v, 2
+  %v3 = add i16 %v, 3
+  %v4 = add i16 %v, 4
+  %v5 = add i16 %v, 5
+  %v6 = add i16 %v, 6
+  %v7 = add i16 %v, 7
+
+  %out0 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.6
+  %out7 = getelementptr inbounds [1024 x i16], ptr @B, i64 0, i64 %iv.7
+
+  store i16 %v0, ptr %out0
+  store i16 %v1, ptr %out1
+  store i16 %v2, ptr %out2
+  store i16 %v3, ptr %out3
+  store i16 %v4, ptr %out4
+  store i16 %v5, ptr %out5
+  store i16 %v6, ptr %out6
+  store i16 %v7, ptr %out7
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll
index dceb41472ace2..0fcd96667c203 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-2.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v1, ptr %out1"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v1, i32* %out1, align 4
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i32 %v1, i32* %out1, align 4
-; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction:   store i32 %v1, i32* %out1, align 4
-; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction:   store i32 %v1, i32* %out1, align 4
-; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction:   store i32 %v1, i32* %out1, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4
+; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %v1, ptr %out1, align 4
+; SSE2:  LV: Found an estimated cost of 30 for VF 4 For instruction: store i32 %v1, ptr %out1, align 4
+; SSE2:  LV: Found an estimated cost of 60 for VF 8 For instruction: store i32 %v1, ptr %out1, align 4
+; SSE2:  LV: Found an estimated cost of 120 for VF 16 For instruction: store i32 %v1, ptr %out1, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX1: LV: Found an estimated cost of 19 for VF 4 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX1: LV: Found an estimated cost of 46 for VF 8 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX1: LV: Found an estimated cost of 92 for VF 16 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX1: LV: Found an estimated cost of 184 for VF 32 For instruction:   store i32 %v1, i32* %out1, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 19 for VF 4 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 46 for VF 8 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 92 for VF 16 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX1:  LV: Found an estimated cost of 184 for VF 32 For instruction: store i32 %v1, ptr %out1, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction:   store i32 %v1, i32* %out1, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 3 for VF 4 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 6 for VF 8 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 12 for VF 16 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX2:  LV: Found an estimated cost of 24 for VF 32 For instruction: store i32 %v1, ptr %out1, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX512: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX512: LV: Found an estimated cost of 10 for VF 32 For instruction:   store i32 %v1, i32* %out1, align 4
-; AVX512: LV: Found an estimated cost of 20 for VF 64 For instruction:   store i32 %v1, i32* %out1, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 2 for VF 8 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 16 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 10 for VF 32 For instruction: store i32 %v1, ptr %out1, align 4
+; AVX512:  LV: Found an estimated cost of 20 for VF 64 For instruction: store i32 %v1, ptr %out1, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i32 %v1, i32* %out1, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,19 +53,19 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i32
 
   %v0 = add i32 %v, 0
   %v1 = add i32 %v, 1
 
-  %out0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.1
+  %out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1
 
-  store i32 %v0, i32* %out0
-  store i32 %v1, i32* %out1
+  store i32 %v0, ptr %out0
+  store i32 %v1, ptr %out1
 
   %iv.next = add nuw nsw i64 %iv.0, 2
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
index a75e116846eab..ecacdc06d7d01 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-3.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v2, ptr %out2"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v2, i32* %out2, align 4
-; SSE2: LV: Found an estimated cost of 23 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4
-; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction:   store i32 %v2, i32* %out2, align 4
-; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction:   store i32 %v2, i32* %out2, align 4
-; SSE2: LV: Found an estimated cost of 192 for VF 16 For instruction:   store i32 %v2, i32* %out2, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4
+; SSE2:  LV: Found an estimated cost of 23 for VF 2 For instruction: store i32 %v2, ptr %out2, align 4
+; SSE2:  LV: Found an estimated cost of 48 for VF 4 For instruction: store i32 %v2, ptr %out2, align 4
+; SSE2:  LV: Found an estimated cost of 96 for VF 8 For instruction: store i32 %v2, ptr %out2, align 4
+; SSE2:  LV: Found an estimated cost of 192 for VF 16 For instruction: store i32 %v2, ptr %out2, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX1: LV: Found an estimated cost of 29 for VF 4 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX1: LV: Found an estimated cost of 69 for VF 8 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX1: LV: Found an estimated cost of 138 for VF 16 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX1: LV: Found an estimated cost of 276 for VF 32 For instruction:   store i32 %v2, i32* %out2, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 18 for VF 2 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 29 for VF 4 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 69 for VF 8 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 138 for VF 16 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX1:  LV: Found an estimated cost of 276 for VF 32 For instruction: store i32 %v2, ptr %out2, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction:   store i32 %v2, i32* %out2, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 7 for VF 2 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 7 for VF 4 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 14 for VF 8 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 28 for VF 16 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX2:  LV: Found an estimated cost of 60 for VF 32 For instruction: store i32 %v2, ptr %out2, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX512: LV: Found an estimated cost of 24 for VF 32 For instruction:   store i32 %v2, i32* %out2, align 4
-; AVX512: LV: Found an estimated cost of 48 for VF 64 For instruction:   store i32 %v2, i32* %out2, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 12 for VF 16 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 24 for VF 32 For instruction: store i32 %v2, ptr %out2, align 4
+; AVX512:  LV: Found an estimated cost of 48 for VF 64 For instruction: store i32 %v2, ptr %out2, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i32 %v2, i32* %out2, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,8 +54,8 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i32
 
@@ -62,13 +63,13 @@ for.body:
   %v1 = add i32 %v, 1
   %v2 = add i32 %v, 2
 
-  %out0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.2
+  %out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.2
 
-  store i32 %v0, i32* %out0
-  store i32 %v1, i32* %out1
-  store i32 %v2, i32* %out2
+  store i32 %v0, ptr %out0
+  store i32 %v1, ptr %out1
+  store i32 %v2, ptr %out2
 
   %iv.next = add nuw nsw i64 %iv.0, 3
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
index 25cfe69878c06..6fdc9282d6b50 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-4.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v3, ptr %out3"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v3, i32* %out3, align 4
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4
-; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction:   store i32 %v3, i32* %out3, align 4
-; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction:   store i32 %v3, i32* %out3, align 4
-; SSE2: LV: Found an estimated cost of 240 for VF 16 For instruction:   store i32 %v3, i32* %out3, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4
+; SSE2:  LV: Found an estimated cost of 28 for VF 2 For instruction: store i32 %v3, ptr %out3, align 4
+; SSE2:  LV: Found an estimated cost of 60 for VF 4 For instruction: store i32 %v3, ptr %out3, align 4
+; SSE2:  LV: Found an estimated cost of 120 for VF 8 For instruction: store i32 %v3, ptr %out3, align 4
+; SSE2:  LV: Found an estimated cost of 240 for VF 16 For instruction: store i32 %v3, ptr %out3, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX1: LV: Found an estimated cost of 19 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX1: LV: Found an estimated cost of 38 for VF 4 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX1: LV: Found an estimated cost of 92 for VF 8 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX1: LV: Found an estimated cost of 184 for VF 16 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX1: LV: Found an estimated cost of 368 for VF 32 For instruction:   store i32 %v3, i32* %out3, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 19 for VF 2 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 38 for VF 4 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 92 for VF 8 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 184 for VF 16 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX1:  LV: Found an estimated cost of 368 for VF 32 For instruction: store i32 %v3, ptr %out3, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX2: LV: Found an estimated cost of 80 for VF 32 For instruction:   store i32 %v3, i32* %out3, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 20 for VF 8 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX2:  LV: Found an estimated cost of 80 for VF 32 For instruction: store i32 %v3, ptr %out3, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX512: LV: Found an estimated cost of 11 for VF 8 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX512: LV: Found an estimated cost of 44 for VF 32 For instruction:   store i32 %v3, i32* %out3, align 4
-; AVX512: LV: Found an estimated cost of 88 for VF 64 For instruction:   store i32 %v3, i32* %out3, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 44 for VF 32 For instruction: store i32 %v3, ptr %out3, align 4
+; AVX512:  LV: Found an estimated cost of 88 for VF 64 For instruction: store i32 %v3, ptr %out3, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i32 %v3, i32* %out3, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -54,8 +55,8 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i32
 
@@ -64,15 +65,15 @@ for.body:
   %v2 = add i32 %v, 2
   %v3 = add i32 %v, 3
 
-  %out0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.3
+  %out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.3
 
-  store i32 %v0, i32* %out0
-  store i32 %v1, i32* %out1
-  store i32 %v2, i32* %out2
-  store i32 %v3, i32* %out3
+  store i32 %v0, ptr %out0
+  store i32 %v1, ptr %out1
+  store i32 %v2, ptr %out2
+  store i32 %v3, ptr %out3
 
   %iv.next = add nuw nsw i64 %iv.0, 4
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-5.ll
new file mode 100644
index 0000000000000..7a1921faacc44
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-5.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v4, ptr %out4"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i32] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4
+; SSE2:  LV: Found an estimated cost of 40 for VF 2 For instruction: store i32 %v4, ptr %out4, align 4
+; SSE2:  LV: Found an estimated cost of 84 for VF 4 For instruction: store i32 %v4, ptr %out4, align 4
+; SSE2:  LV: Found an estimated cost of 168 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX1:  LV: Found an estimated cost of 25 for VF 2 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX1:  LV: Found an estimated cost of 48 for VF 4 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX1:  LV: Found an estimated cost of 115 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX1:  LV: Found an estimated cost of 230 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX2:  LV: Found an estimated cost of 25 for VF 2 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX2:  LV: Found an estimated cost of 48 for VF 4 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX2:  LV: Found an estimated cost of 115 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX2:  LV: Found an estimated cost of 230 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 7 for VF 2 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 14 for VF 4 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 21 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 35 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 70 for VF 32 For instruction: store i32 %v4, ptr %out4, align 4
+; AVX512:  LV: Found an estimated cost of 140 for VF 64 For instruction: store i32 %v4, ptr %out4, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = zext i8 %v.narrow to i32
+
+  %v0 = add i32 %v, 0
+  %v1 = add i32 %v, 1
+  %v2 = add i32 %v, 2
+  %v3 = add i32 %v, 3
+  %v4 = add i32 %v, 4
+
+  %out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.4
+
+  store i32 %v0, ptr %out0
+  store i32 %v1, ptr %out1
+  store i32 %v2, ptr %out2
+  store i32 %v3, ptr %out3
+  store i32 %v4, ptr %out4
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
index df2c6a5ec909e..44bc4436a76e1 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-6.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v5, ptr %out5"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,36 +11,36 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i32] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v5, i32* %out5, align 4
-; SSE2: LV: Found an estimated cost of 45 for VF 2 For instruction:   store i32 %v5, i32* %out5, align 4
-; SSE2: LV: Found an estimated cost of 96 for VF 4 For instruction:   store i32 %v5, i32* %out5, align 4
-; SSE2: LV: Found an estimated cost of 192 for VF 8 For instruction:   store i32 %v5, i32* %out5, align 4
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4
+; SSE2:  LV: Found an estimated cost of 45 for VF 2 For instruction: store i32 %v5, ptr %out5, align 4
+; SSE2:  LV: Found an estimated cost of 96 for VF 4 For instruction: store i32 %v5, ptr %out5, align 4
+; SSE2:  LV: Found an estimated cost of 192 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX1: LV: Found an estimated cost of 29 for VF 2 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX1: LV: Found an estimated cost of 57 for VF 4 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX1: LV: Found an estimated cost of 138 for VF 8 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX1: LV: Found an estimated cost of 276 for VF 16 For instruction:   store i32 %v5, i32* %out5, align 4
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX1:  LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX1:  LV: Found an estimated cost of 57 for VF 4 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX1:  LV: Found an estimated cost of 138 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX1:  LV: Found an estimated cost of 276 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX2: LV: Found an estimated cost of 15 for VF 4 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX2: LV: Found an estimated cost of 39 for VF 8 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX2: LV: Found an estimated cost of 78 for VF 16 For instruction:   store i32 %v5, i32* %out5, align 4
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX2:  LV: Found an estimated cost of 11 for VF 2 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX2:  LV: Found an estimated cost of 15 for VF 4 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX2:  LV: Found an estimated cost of 39 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX2:  LV: Found an estimated cost of 78 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction:   store i32 %v5, i32* %out5, align 4
-; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction:   store i32 %v5, i32* %out5, align 4
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 17 for VF 4 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 25 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 51 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 102 for VF 32 For instruction: store i32 %v5, ptr %out5, align 4
+; AVX512:  LV: Found an estimated cost of 204 for VF 64 For instruction: store i32 %v5, ptr %out5, align 4
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i32 %v5, i32* %out5, align 4
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,8 +54,8 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i32
 
@@ -65,19 +66,19 @@ for.body:
   %v4 = add i32 %v, 4
   %v5 = add i32 %v, 5
 
-  %out0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.3
-  %out4 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.4
-  %out5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %iv.5
+  %out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.5
 
-  store i32 %v0, i32* %out0
-  store i32 %v1, i32* %out1
-  store i32 %v2, i32* %out2
-  store i32 %v3, i32* %out3
-  store i32 %v4, i32* %out4
-  store i32 %v5, i32* %out5
+  store i32 %v0, ptr %out0
+  store i32 %v1, ptr %out1
+  store i32 %v2, ptr %out2
+  store i32 %v3, ptr %out3
+  store i32 %v4, ptr %out4
+  store i32 %v5, ptr %out5
 
   %iv.next = add nuw nsw i64 %iv.0, 6
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-7.ll
new file mode 100644
index 0000000000000..72aa6e9f938c1
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-7.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v6, ptr %out6"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i32] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4
+; SSE2:  LV: Found an estimated cost of 51 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4
+; SSE2:  LV: Found an estimated cost of 108 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4
+; SSE2:  LV: Found an estimated cost of 216 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX1:  LV: Found an estimated cost of 37 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX1:  LV: Found an estimated cost of 67 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX1:  LV: Found an estimated cost of 161 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX1:  LV: Found an estimated cost of 322 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX2:  LV: Found an estimated cost of 37 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX2:  LV: Found an estimated cost of 67 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX2:  LV: Found an estimated cost of 161 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX2:  LV: Found an estimated cost of 322 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 10 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 20 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 40 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 70 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4
+; AVX512:  LV: Found an estimated cost of 140 for VF 32 For instruction: store i32 %v6, ptr %out6, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = zext i8 %v.narrow to i32
+
+  %v0 = add i32 %v, 0
+  %v1 = add i32 %v, 1
+  %v2 = add i32 %v, 2
+  %v3 = add i32 %v, 3
+  %v4 = add i32 %v, 4
+  %v5 = add i32 %v, 5
+  %v6 = add i32 %v, 6
+
+  %out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.6
+
+  store i32 %v0, ptr %out0
+  store i32 %v1, ptr %out1
+  store i32 %v2, ptr %out2
+  store i32 %v3, ptr %out3
+  store i32 %v4, ptr %out4
+  store i32 %v5, ptr %out5
+  store i32 %v6, ptr %out6
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-8.ll
new file mode 100644
index 0000000000000..fa655335d3e90
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i32-stride-8.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i32 %v7, ptr %out7"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i32] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4
+; SSE2:  LV: Found an estimated cost of 56 for VF 2 For instruction: store i32 %v7, ptr %out7, align 4
+; SSE2:  LV: Found an estimated cost of 120 for VF 4 For instruction: store i32 %v7, ptr %out7, align 4
+; SSE2:  LV: Found an estimated cost of 240 for VF 8 For instruction: store i32 %v7, ptr %out7, align 4
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX1:  LV: Found an estimated cost of 38 for VF 2 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX1:  LV: Found an estimated cost of 76 for VF 4 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX1:  LV: Found an estimated cost of 184 for VF 8 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX1:  LV: Found an estimated cost of 368 for VF 16 For instruction: store i32 %v7, ptr %out7, align 4
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX2:  LV: Found an estimated cost of 38 for VF 2 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX2:  LV: Found an estimated cost of 76 for VF 4 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX2:  LV: Found an estimated cost of 184 for VF 8 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX2:  LV: Found an estimated cost of 368 for VF 16 For instruction: store i32 %v7, ptr %out7, align 4
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 11 for VF 2 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 23 for VF 4 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 46 for VF 8 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 92 for VF 16 For instruction: store i32 %v7, ptr %out7, align 4
+; AVX512:  LV: Found an estimated cost of 184 for VF 32 For instruction: store i32 %v7, ptr %out7, align 4
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = zext i8 %v.narrow to i32
+
+  %v0 = add i32 %v, 0
+  %v1 = add i32 %v, 1
+  %v2 = add i32 %v, 2
+  %v3 = add i32 %v, 3
+  %v4 = add i32 %v, 4
+  %v5 = add i32 %v, 5
+  %v6 = add i32 %v, 6
+  %v7 = add i32 %v, 7
+
+  %out0 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.6
+  %out7 = getelementptr inbounds [1024 x i32], ptr @B, i64 0, i64 %iv.7
+
+  store i32 %v0, ptr %out0
+  store i32 %v1, ptr %out1
+  store i32 %v2, ptr %out2
+  store i32 %v3, ptr %out3
+  store i32 %v4, ptr %out4
+  store i32 %v5, ptr %out5
+  store i32 %v6, ptr %out6
+  store i32 %v7, ptr %out7
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
index 9fe32785ad9c3..5a5ca679ca6ee 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-2.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v1, ptr %out1"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +11,39 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v1, i64* %out1, align 8
-; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8
-; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction:   store i64 %v1, i64* %out1, align 8
-; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction:   store i64 %v1, i64* %out1, align 8
-; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction:   store i64 %v1, i64* %out1, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8
+; SSE2:  LV: Found an estimated cost of 14 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8
+; SSE2:  LV: Found an estimated cost of 28 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8
+; SSE2:  LV: Found an estimated cost of 56 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
+; SSE2:  LV: Found an estimated cost of 112 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX1: LV: Found an estimated cost of 26 for VF 4 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX1: LV: Found an estimated cost of 52 for VF 8 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX1: LV: Found an estimated cost of 104 for VF 16 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX1: LV: Found an estimated cost of 208 for VF 32 For instruction:   store i64 %v1, i64* %out1, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 11 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 26 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 52 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 104 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX1:  LV: Found an estimated cost of 208 for VF 32 For instruction: store i64 %v1, ptr %out1, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction:   store i64 %v1, i64* %out1, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 3 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 6 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 12 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 24 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX2:  LV: Found an estimated cost of 48 for VF 32 For instruction: store i64 %v1, ptr %out1, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX512: LV: Found an estimated cost of 10 for VF 16 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX512: LV: Found an estimated cost of 20 for VF 32 For instruction:   store i64 %v1, i64* %out1, align 8
-; AVX512: LV: Found an estimated cost of 40 for VF 64 For instruction:   store i64 %v1, i64* %out1, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 5 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 10 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 20 for VF 32 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 64 For instruction: store i64 %v1, ptr %out1, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i64 %v1, i64* %out1, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,19 +53,19 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i64
 
   %v0 = add i64 %v, 0
   %v1 = add i64 %v, 1
 
-  %out0 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.1
+  %out0 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.1
 
-  store i64 %v0, i64* %out0
-  store i64 %v1, i64* %out1
+  store i64 %v0, ptr %out0
+  store i64 %v1, ptr %out1
 
   %iv.next = add nuw nsw i64 %iv.0, 2
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll
index 3111387bd0997..0cac57729eb87 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-3.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v2, ptr %out2"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,36 +11,36 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v2, i64* %out2, align 8
-; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction:   store i64 %v2, i64* %out2, align 8
-; SSE2: LV: Found an estimated cost of 44 for VF 4 For instruction:   store i64 %v2, i64* %out2, align 8
-; SSE2: LV: Found an estimated cost of 88 for VF 8 For instruction:   store i64 %v2, i64* %out2, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8
+; SSE2:  LV: Found an estimated cost of 22 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8
+; SSE2:  LV: Found an estimated cost of 44 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8
+; SSE2:  LV: Found an estimated cost of 88 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX1: LV: Found an estimated cost of 39 for VF 4 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX1: LV: Found an estimated cost of 78 for VF 8 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX1: LV: Found an estimated cost of 156 for VF 16 For instruction:   store i64 %v2, i64* %out2, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX1:  LV: Found an estimated cost of 17 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX1:  LV: Found an estimated cost of 39 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX1:  LV: Found an estimated cost of 78 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX1:  LV: Found an estimated cost of 156 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction:   store i64 %v2, i64* %out2, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX2:  LV: Found an estimated cost of 6 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX2:  LV: Found an estimated cost of 9 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX2:  LV: Found an estimated cost of 18 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX2:  LV: Found an estimated cost of 36 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction:   store i64 %v2, i64* %out2, align 8
-; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction:   store i64 %v2, i64* %out2, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 8 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 12 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 24 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 48 for VF 32 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512:  LV: Found an estimated cost of 96 for VF 64 For instruction: store i64 %v2, ptr %out2, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i64 %v2, i64* %out2, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -50,8 +51,8 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i64
 
@@ -59,13 +60,13 @@ for.body:
   %v1 = add i64 %v, 1
   %v2 = add i64 %v, 2
 
-  %out0 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.2
+  %out0 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.2
 
-  store i64 %v0, i64* %out0
-  store i64 %v1, i64* %out1
-  store i64 %v2, i64* %out2
+  store i64 %v0, ptr %out0
+  store i64 %v1, ptr %out1
+  store i64 %v2, ptr %out2
 
   %iv.next = add nuw nsw i64 %iv.0, 3
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
index 85bd6736e428a..62aaef625e119 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-4.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v3, ptr %out3"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,35 +11,35 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   store i64 %v3, i64* %out3, align 8
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction:   store i64 %v3, i64* %out3, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8
+; SSE2:  LV: Found an estimated cost of 28 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8
+; SSE2:  LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8
+; SSE2:  LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX1: LV: Found an estimated cost of 22 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX1: LV: Found an estimated cost of 104 for VF 8 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX1: LV: Found an estimated cost of 208 for VF 16 For instruction:   store i64 %v3, i64* %out3, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX1:  LV: Found an estimated cost of 22 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX1:  LV: Found an estimated cost of 52 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX1:  LV: Found an estimated cost of 104 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX1:  LV: Found an estimated cost of 208 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction:   store i64 %v3, i64* %out3, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX2:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX2:  LV: Found an estimated cost of 12 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX2:  LV: Found an estimated cost of 28 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX2:  LV: Found an estimated cost of 56 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX512: LV: Found an estimated cost of 44 for VF 16 For instruction:   store i64 %v3, i64* %out3, align 8
-; AVX512: LV: Found an estimated cost of 88 for VF 32 For instruction:   store i64 %v3, i64* %out3, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 11 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 22 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 44 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512:  LV: Found an estimated cost of 88 for VF 32 For instruction: store i64 %v3, ptr %out3, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i64 %v3, i64* %out3, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -50,8 +51,8 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i64
 
@@ -60,15 +61,15 @@ for.body:
   %v2 = add i64 %v, 2
   %v3 = add i64 %v, 3
 
-  %out0 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.3
+  %out0 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.3
 
-  store i64 %v0, i64* %out0
-  store i64 %v1, i64* %out1
-  store i64 %v2, i64* %out2
-  store i64 %v3, i64* %out3
+  store i64 %v0, ptr %out0
+  store i64 %v1, ptr %out1
+  store i64 %v2, ptr %out2
+  store i64 %v3, ptr %out3
 
   %iv.next = add nuw nsw i64 %iv.0, 4
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-5.ll
new file mode 100644
index 0000000000000..0e21b5bd3af28
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-5.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v4, ptr %out4"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i64] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8
+; SSE2:  LV: Found an estimated cost of 38 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8
+; SSE2:  LV: Found an estimated cost of 76 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX1:  LV: Found an estimated cost of 28 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX1:  LV: Found an estimated cost of 65 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX1:  LV: Found an estimated cost of 130 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX2:  LV: Found an estimated cost of 28 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX2:  LV: Found an estimated cost of 65 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX2:  LV: Found an estimated cost of 130 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 14 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 21 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 35 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 70 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512:  LV: Found an estimated cost of 140 for VF 32 For instruction: store i64 %v4, ptr %out4, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = zext i8 %v.narrow to i64
+
+  %v0 = add i64 %v, 0
+  %v1 = add i64 %v, 1
+  %v2 = add i64 %v, 2
+  %v3 = add i64 %v, 3
+  %v4 = add i64 %v, 4
+
+  %out0 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.4
+
+  store i64 %v0, ptr %out0
+  store i64 %v1, ptr %out1
+  store i64 %v2, ptr %out2
+  store i64 %v3, ptr %out3
+  store i64 %v4, ptr %out4
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll
index 96401638067ef..56bc67ba2cfa9 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-6.ll
@@ -1,7 +1,8 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v5, ptr %out5"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,32 +11,32 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i64] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
-;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v5, i64* %out5, align 8
-; SSE2: LV: Found an estimated cost of 44 for VF 2 For instruction:   store i64 %v5, i64* %out5, align 8
-; SSE2: LV: Found an estimated cost of 88 for VF 4 For instruction:   store i64 %v5, i64* %out5, align 8
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8
+; SSE2:  LV: Found an estimated cost of 44 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8
+; SSE2:  LV: Found an estimated cost of 88 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX1: LV: Found an estimated cost of 33 for VF 2 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX1: LV: Found an estimated cost of 78 for VF 4 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX1: LV: Found an estimated cost of 156 for VF 8 For instruction:   store i64 %v5, i64* %out5, align 8
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX1:  LV: Found an estimated cost of 33 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX1:  LV: Found an estimated cost of 78 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX1:  LV: Found an estimated cost of 156 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX2: LV: Found an estimated cost of 21 for VF 4 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX2: LV: Found an estimated cost of 42 for VF 8 For instruction:   store i64 %v5, i64* %out5, align 8
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX2:  LV: Found an estimated cost of 11 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX2:  LV: Found an estimated cost of 21 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX2:  LV: Found an estimated cost of 42 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX512: LV: Found an estimated cost of 17 for VF 2 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX512: LV: Found an estimated cost of 25 for VF 4 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX512: LV: Found an estimated cost of 102 for VF 16 For instruction:   store i64 %v5, i64* %out5, align 8
-; AVX512: LV: Found an estimated cost of 204 for VF 32 For instruction:   store i64 %v5, i64* %out5, align 8
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 17 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 25 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 51 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 102 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512:  LV: Found an estimated cost of 204 for VF 32 For instruction: store i64 %v5, ptr %out5, align 8
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i64 %v5, i64* %out5, align 8
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -49,8 +50,8 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v.narrow = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
 
   %v = zext i8 %v.narrow to i64
 
@@ -61,19 +62,19 @@ for.body:
   %v4 = add i64 %v, 4
   %v5 = add i64 %v, 5
 
-  %out0 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.3
-  %out4 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.4
-  %out5 = getelementptr inbounds [1024 x i64], [1024 x i64]* @B, i64 0, i64 %iv.5
+  %out0 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.5
 
-  store i64 %v0, i64* %out0
-  store i64 %v1, i64* %out1
-  store i64 %v2, i64* %out2
-  store i64 %v3, i64* %out3
-  store i64 %v4, i64* %out4
-  store i64 %v5, i64* %out5
+  store i64 %v0, ptr %out0
+  store i64 %v1, ptr %out1
+  store i64 %v2, ptr %out2
+  store i64 %v3, ptr %out3
+  store i64 %v4, ptr %out4
+  store i64 %v5, ptr %out5
 
   %iv.next = add nuw nsw i64 %iv.0, 6
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-7.ll
new file mode 100644
index 0000000000000..13f4123ee8c11
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-7.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v6, ptr %out6"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i64] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8
+; SSE2:  LV: Found an estimated cost of 50 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8
+; SSE2:  LV: Found an estimated cost of 100 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX1:  LV: Found an estimated cost of 39 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX1:  LV: Found an estimated cost of 91 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX1:  LV: Found an estimated cost of 182 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX2:  LV: Found an estimated cost of 39 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX2:  LV: Found an estimated cost of 91 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX2:  LV: Found an estimated cost of 182 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 20 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 40 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 70 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 140 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512:  LV: Found an estimated cost of 280 for VF 32 For instruction: store i64 %v6, ptr %out6, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = zext i8 %v.narrow to i64
+
+  %v0 = add i64 %v, 0
+  %v1 = add i64 %v, 1
+  %v2 = add i64 %v, 2
+  %v3 = add i64 %v, 3
+  %v4 = add i64 %v, 4
+  %v5 = add i64 %v, 5
+  %v6 = add i64 %v, 6
+
+  %out0 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.6
+
+  store i64 %v0, ptr %out0
+  store i64 %v1, ptr %out1
+  store i64 %v2, ptr %out2
+  store i64 %v3, ptr %out3
+  store i64 %v4, ptr %out4
+  store i64 %v5, ptr %out5
+  store i64 %v6, ptr %out6
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
new file mode 100644
index 0000000000000..4d871ef84c0b1
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v7, ptr %out7"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i64] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8
+; SSE2:  LV: Found an estimated cost of 56 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8
+; SSE2:  LV: Found an estimated cost of 112 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX1:  LV: Found an estimated cost of 44 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX1:  LV: Found an estimated cost of 104 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX1:  LV: Found an estimated cost of 208 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX2:  LV: Found an estimated cost of 44 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX2:  LV: Found an estimated cost of 104 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX2:  LV: Found an estimated cost of 208 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
+;
+; AVX512-LABEL: 'test'
+; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512:  LV: Found an estimated cost of 23 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512:  LV: Found an estimated cost of 46 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512:  LV: Found an estimated cost of 80 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v.narrow = load i8, ptr %in
+
+  %v = zext i8 %v.narrow to i64
+
+  %v0 = add i64 %v, 0
+  %v1 = add i64 %v, 1
+  %v2 = add i64 %v, 2
+  %v3 = add i64 %v, 3
+  %v4 = add i64 %v, 4
+  %v5 = add i64 %v, 5
+  %v6 = add i64 %v, 6
+  %v7 = add i64 %v, 7
+
+  %out0 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.6
+  %out7 = getelementptr inbounds [1024 x i64], ptr @B, i64 0, i64 %iv.7
+
+  store i64 %v0, ptr %out0
+  store i64 %v1, ptr %out1
+  store i64 %v2, ptr %out2
+  store i64 %v3, ptr %out3
+  store i64 %v4, ptr %out4
+  store i64 %v5, ptr %out5
+  store i64 %v6, ptr %out6
+  store i64 %v7, ptr %out7
+
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
index ac7ac428b6bf6..a4f19d0a45fa2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i8 %v1, ptr %out1"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1
+; SSE2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1
+; SSE2:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1
+; SSE2:  LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1
+; SSE2:  LV: Found an estimated cost of 126 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
-; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
-; SSE2: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i8 %v1, i8* %out1, align 1
-; SSE2: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i8 %v1, i8* %out1, align 1
-; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX1:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX1:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX1:  LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX1:  LV: Found an estimated cost of 67 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX1:  LV: Found an estimated cost of 166 for VF 32 For instruction: store i8 %v1, ptr %out1, align 1
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX1: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX1: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX1: LV: Found an estimated cost of 67 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction:   store i8 %v1, i8* %out1, align 1
-
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX2: LV: Found an estimated cost of 2 for VF 8 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX2: LV: Found an estimated cost of 4 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX2: LV: Found an estimated cost of 6 for VF 32 For instruction:   store i8 %v1, i8* %out1, align 1
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX2:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX2:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX2:  LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX2:  LV: Found an estimated cost of 4 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX2:  LV: Found an estimated cost of 6 for VF 32 For instruction: store i8 %v1, ptr %out1, align 1
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX512: LV: Found an estimated cost of 4 for VF 8 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX512: LV: Found an estimated cost of 8 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX512: LV: Found an estimated cost of 20 for VF 32 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX512: LV: Found an estimated cost of 41 for VF 64 For instruction:   store i8 %v1, i8* %out1, align 1
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512DQ:  LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512DQ:  LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512DQ:  LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512DQ:  LV: Found an estimated cost of 4 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512DQ:  LV: Found an estimated cost of 5 for VF 32 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512DQ:  LV: Found an estimated cost of 362 for VF 64 For instruction: store i8 %v1, ptr %out1, align 1
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512BW:  LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512BW:  LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512BW:  LV: Found an estimated cost of 4 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512BW:  LV: Found an estimated cost of 8 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512BW:  LV: Found an estimated cost of 20 for VF 32 For instruction: store i8 %v1, ptr %out1, align 1
+; AVX512BW:  LV: Found an estimated cost of 41 for VF 64 For instruction: store i8 %v1, ptr %out1, align 1
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i8 %v1, i8* %out1, align 1
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -52,17 +63,17 @@ for.body:
   %iv.0 = add nuw nsw i64 %iv, 0
   %iv.1 = add nuw nsw i64 %iv, 1
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v = load i8, ptr %in
 
   %v0 = add i8 %v, 0
   %v1 = add i8 %v, 1
 
-  %out0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.1
+  %out0 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.1
 
-  store i8 %v0, i8* %out0
-  store i8 %v1, i8* %out1
+  store i8 %v0, ptr %out0
+  store i8 %v1, ptr %out1
 
   %iv.next = add nuw nsw i64 %iv.0, 2
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
index d633dbcd3c63b..7809ff49ee36c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i8 %v2, ptr %out2"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1
+; SSE2:  LV: Found an estimated cost of 25 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1
+; SSE2:  LV: Found an estimated cost of 52 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1
+; SSE2:  LV: Found an estimated cost of 101 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1
+; SSE2:  LV: Found an estimated cost of 204 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v2, i8* %out2, align 1
-; SSE2: LV: Found an estimated cost of 25 for VF 2 For instruction:   store i8 %v2, i8* %out2, align 1
-; SSE2: LV: Found an estimated cost of 52 for VF 4 For instruction:   store i8 %v2, i8* %out2, align 1
-; SSE2: LV: Found an estimated cost of 101 for VF 8 For instruction:   store i8 %v2, i8* %out2, align 1
-; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction:   store i8 %v2, i8* %out2, align 1
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX1:  LV: Found an estimated cost of 15 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX1:  LV: Found an estimated cost of 27 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX1:  LV: Found an estimated cost of 54 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX1:  LV: Found an estimated cost of 101 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX1:  LV: Found an estimated cost of 249 for VF 32 For instruction: store i8 %v2, ptr %out2, align 1
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX1: LV: Found an estimated cost of 27 for VF 4 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX1: LV: Found an estimated cost of 54 for VF 8 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX1: LV: Found an estimated cost of 101 for VF 16 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX1: LV: Found an estimated cost of 249 for VF 32 For instruction:   store i8 %v2, i8* %out2, align 1
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX2:  LV: Found an estimated cost of 7 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX2:  LV: Found an estimated cost of 7 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX2:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX2:  LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX2:  LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %v2, ptr %out2, align 1
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX2: LV: Found an estimated cost of 9 for VF 8 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX2: LV: Found an estimated cost of 13 for VF 16 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %v2, i8* %out2, align 1
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512DQ:  LV: Found an estimated cost of 7 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512DQ:  LV: Found an estimated cost of 7 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512DQ:  LV: Found an estimated cost of 9 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512DQ:  LV: Found an estimated cost of 14 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512DQ:  LV: Found an estimated cost of 15 for VF 32 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512DQ:  LV: Found an estimated cost of 543 for VF 64 For instruction: store i8 %v2, ptr %out2, align 1
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX512: LV: Found an estimated cost of 13 for VF 16 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX512: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX512: LV: Found an estimated cost of 29 for VF 64 For instruction:   store i8 %v2, i8* %out2, align 1
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512BW:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512BW:  LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512BW:  LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512BW:  LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512BW:  LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %v2, ptr %out2, align 1
+; AVX512BW:  LV: Found an estimated cost of 29 for VF 64 For instruction: store i8 %v2, ptr %out2, align 1
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i8 %v2, i8* %out2, align 1
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -53,20 +64,20 @@ for.body:
   %iv.1 = add nuw nsw i64 %iv, 1
   %iv.2 = add nuw nsw i64 %iv, 2
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v = load i8, ptr %in
 
   %v0 = add i8 %v, 0
   %v1 = add i8 %v, 1
   %v2 = add i8 %v, 2
 
-  %out0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.2
+  %out0 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.2
 
-  store i8 %v0, i8* %out0
-  store i8 %v1, i8* %out1
-  store i8 %v2, i8* %out2
+  store i8 %v0, ptr %out0
+  store i8 %v1, ptr %out1
+  store i8 %v2, ptr %out2
 
   %iv.next = add nuw nsw i64 %iv.0, 3
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll
index 8353e40154320..f0c5533868006 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-4.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i8 %v3, ptr %out3"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1
+; SSE2:  LV: Found an estimated cost of 28 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1
+; SSE2:  LV: Found an estimated cost of 60 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1
+; SSE2:  LV: Found an estimated cost of 124 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1
+; SSE2:  LV: Found an estimated cost of 252 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v3, i8* %out3, align 1
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i8 %v3, i8* %out3, align 1
-; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction:   store i8 %v3, i8* %out3, align 1
-; SSE2: LV: Found an estimated cost of 124 for VF 8 For instruction:   store i8 %v3, i8* %out3, align 1
-; SSE2: LV: Found an estimated cost of 252 for VF 16 For instruction:   store i8 %v3, i8* %out3, align 1
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX1:  LV: Found an estimated cost of 17 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX1:  LV: Found an estimated cost of 33 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX1:  LV: Found an estimated cost of 67 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX1:  LV: Found an estimated cost of 134 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX1:  LV: Found an estimated cost of 332 for VF 32 For instruction: store i8 %v3, ptr %out3, align 1
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX1: LV: Found an estimated cost of 33 for VF 4 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX1: LV: Found an estimated cost of 67 for VF 8 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX1: LV: Found an estimated cost of 134 for VF 16 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX1: LV: Found an estimated cost of 332 for VF 32 For instruction:   store i8 %v3, i8* %out3, align 1
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX2:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX2:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX2:  LV: Found an estimated cost of 5 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX2:  LV: Found an estimated cost of 10 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX2:  LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %v3, ptr %out3, align 1
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX2: LV: Found an estimated cost of 5 for VF 8 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX2: LV: Found an estimated cost of 10 for VF 16 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %v3, i8* %out3, align 1
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512DQ:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512DQ:  LV: Found an estimated cost of 5 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512DQ:  LV: Found an estimated cost of 5 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512DQ:  LV: Found an estimated cost of 9 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512DQ:  LV: Found an estimated cost of 14 for VF 32 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512DQ:  LV: Found an estimated cost of 724 for VF 64 For instruction: store i8 %v3, ptr %out3, align 1
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX512: LV: Found an estimated cost of 11 for VF 8 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX512: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %v3, i8* %out3, align 1
-; AVX512: LV: Found an estimated cost of 28 for VF 64 For instruction:   store i8 %v3, i8* %out3, align 1
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512BW:  LV: Found an estimated cost of 11 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512BW:  LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512BW:  LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512BW:  LV: Found an estimated cost of 12 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512BW:  LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %v3, ptr %out3, align 1
+; AVX512BW:  LV: Found an estimated cost of 28 for VF 64 For instruction: store i8 %v3, ptr %out3, align 1
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i8 %v3, i8* %out3, align 1
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -54,23 +65,23 @@ for.body:
   %iv.2 = add nuw nsw i64 %iv, 2
   %iv.3 = add nuw nsw i64 %iv, 3
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v = load i8, ptr %in
 
   %v0 = add i8 %v, 0
   %v1 = add i8 %v, 1
   %v2 = add i8 %v, 2
   %v3 = add i8 %v, 3
 
-  %out0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.3
+  %out0 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.3
 
-  store i8 %v0, i8* %out0
-  store i8 %v1, i8* %out1
-  store i8 %v2, i8* %out2
-  store i8 %v3, i8* %out3
+  store i8 %v0, ptr %out0
+  store i8 %v1, ptr %out1
+  store i8 %v2, ptr %out2
+  store i8 %v3, ptr %out3
 
   %iv.next = add nuw nsw i64 %iv.0, 4
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-5.ll
new file mode 100644
index 0000000000000..334ea24823284
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-5.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i8 %v4, ptr %out4"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1
+; SSE2:  LV: Found an estimated cost of 43 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1
+; SSE2:  LV: Found an estimated cost of 87 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1
+; SSE2:  LV: Found an estimated cost of 178 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1
+; SSE2:  LV: Found an estimated cost of 360 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX1:  LV: Found an estimated cost of 23 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX1:  LV: Found an estimated cost of 47 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX1:  LV: Found an estimated cost of 85 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX1:  LV: Found an estimated cost of 168 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX1:  LV: Found an estimated cost of 415 for VF 32 For instruction: store i8 %v4, ptr %out4, align 1
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX2:  LV: Found an estimated cost of 23 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX2:  LV: Found an estimated cost of 47 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX2:  LV: Found an estimated cost of 85 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX2:  LV: Found an estimated cost of 168 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX2:  LV: Found an estimated cost of 415 for VF 32 For instruction: store i8 %v4, ptr %out4, align 1
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512DQ:  LV: Found an estimated cost of 23 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512DQ:  LV: Found an estimated cost of 47 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512DQ:  LV: Found an estimated cost of 87 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512DQ:  LV: Found an estimated cost of 167 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512DQ:  LV: Found an estimated cost of 413 for VF 32 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512DQ:  LV: Found an estimated cost of 905 for VF 64 For instruction: store i8 %v4, ptr %out4, align 1
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512BW:  LV: Found an estimated cost of 15 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512BW:  LV: Found an estimated cost of 31 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512BW:  LV: Found an estimated cost of 79 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512BW:  LV: Found an estimated cost of 158 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512BW:  LV: Found an estimated cost of 237 for VF 32 For instruction: store i8 %v4, ptr %out4, align 1
+; AVX512BW:  LV: Found an estimated cost of 395 for VF 64 For instruction: store i8 %v4, ptr %out4, align 1
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v = load i8, ptr %in
+
+  %v0 = add i8 %v, 0
+  %v1 = add i8 %v, 1
+  %v2 = add i8 %v, 2
+  %v3 = add i8 %v, 3
+  %v4 = add i8 %v, 4
+
+  %out0 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.4
+
+  store i8 %v0, ptr %out0
+  store i8 %v1, ptr %out1
+  store i8 %v2, ptr %out2
+  store i8 %v3, ptr %out3
+  store i8 %v4, ptr %out4
+
+  %iv.next = add nuw nsw i64 %iv.0, 5
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll
index ceeb6235c452f..795c7dc70b2cc 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-6.ll
@@ -1,7 +1,9 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512bw,+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX512
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i8 %v5, ptr %out5"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -10,39 +12,48 @@ target triple = "x86_64-unknown-linux-gnu"
 @A = global [1024 x i8] zeroinitializer, align 128
 @B = global [1024 x i8] zeroinitializer, align 128
 
-; CHECK: LV: Checking a loop in 'test'
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1
+; SSE2:  LV: Found an estimated cost of 49 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1
+; SSE2:  LV: Found an estimated cost of 98 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1
+; SSE2:  LV: Found an estimated cost of 201 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1
+; SSE2:  LV: Found an estimated cost of 408 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1
 ;
-; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v5, i8* %out5, align 1
-; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction:   store i8 %v5, i8* %out5, align 1
-; SSE2: LV: Found an estimated cost of 98 for VF 4 For instruction:   store i8 %v5, i8* %out5, align 1
-; SSE2: LV: Found an estimated cost of 201 for VF 8 For instruction:   store i8 %v5, i8* %out5, align 1
-; SSE2: LV: Found an estimated cost of 408 for VF 16 For instruction:   store i8 %v5, i8* %out5, align 1
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX1:  LV: Found an estimated cost of 27 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX1:  LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX1:  LV: Found an estimated cost of 101 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX1:  LV: Found an estimated cost of 201 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX1:  LV: Found an estimated cost of 498 for VF 32 For instruction: store i8 %v5, ptr %out5, align 1
 ;
-; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX1: LV: Found an estimated cost of 101 for VF 8 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX1: LV: Found an estimated cost of 201 for VF 16 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX1: LV: Found an estimated cost of 498 for VF 32 For instruction:   store i8 %v5, i8* %out5, align 1
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX2:  LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX2:  LV: Found an estimated cost of 12 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX2:  LV: Found an estimated cost of 18 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX2:  LV: Found an estimated cost of 30 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX2:  LV: Found an estimated cost of 96 for VF 32 For instruction: store i8 %v5, ptr %out5, align 1
 ;
-; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX2: LV: Found an estimated cost of 10 for VF 2 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX2: LV: Found an estimated cost of 96 for VF 32 For instruction:   store i8 %v5, i8* %out5, align 1
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512DQ:  LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512DQ:  LV: Found an estimated cost of 12 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512DQ:  LV: Found an estimated cost of 19 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512DQ:  LV: Found an estimated cost of 29 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512DQ:  LV: Found an estimated cost of 93 for VF 32 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512DQ:  LV: Found an estimated cost of 1086 for VF 64 For instruction: store i8 %v5, ptr %out5, align 1
 ;
-; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX512: LV: Found an estimated cost of 18 for VF 2 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX512: LV: Found an estimated cost of 38 for VF 4 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX512: LV: Found an estimated cost of 98 for VF 8 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX512: LV: Found an estimated cost of 197 for VF 16 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX512: LV: Found an estimated cost of 295 for VF 32 For instruction:   store i8 %v5, i8* %out5, align 1
-; AVX512: LV: Found an estimated cost of 591 for VF 64 For instruction:   store i8 %v5, i8* %out5, align 1
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512BW:  LV: Found an estimated cost of 18 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512BW:  LV: Found an estimated cost of 38 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512BW:  LV: Found an estimated cost of 98 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512BW:  LV: Found an estimated cost of 197 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512BW:  LV: Found an estimated cost of 295 for VF 32 For instruction: store i8 %v5, ptr %out5, align 1
+; AVX512BW:  LV: Found an estimated cost of 591 for VF 64 For instruction: store i8 %v5, ptr %out5, align 1
 ;
-; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction:   store i8 %v5, i8* %out5, align 1
-
-define void @test() {
 entry:
   br label %for.body
 
@@ -56,8 +67,8 @@ for.body:
   %iv.4 = add nuw nsw i64 %iv, 4
   %iv.5 = add nuw nsw i64 %iv, 5
 
-  %in = getelementptr inbounds [1024 x i8], [1024 x i8]* @A, i64 0, i64 %iv.0
-  %v = load i8, i8* %in
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v = load i8, ptr %in
 
   %v0 = add i8 %v, 0
   %v1 = add i8 %v, 1
@@ -66,19 +77,19 @@ for.body:
   %v4 = add i8 %v, 4
   %v5 = add i8 %v, 5
 
-  %out0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.0
-  %out1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.1
-  %out2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.2
-  %out3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.3
-  %out4 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.4
-  %out5 = getelementptr inbounds [1024 x i8], [1024 x i8]* @B, i64 0, i64 %iv.5
+  %out0 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.5
 
-  store i8 %v0, i8* %out0
-  store i8 %v1, i8* %out1
-  store i8 %v2, i8* %out2
-  store i8 %v3, i8* %out3
-  store i8 %v4, i8* %out4
-  store i8 %v5, i8* %out5
+  store i8 %v0, ptr %out0
+  store i8 %v1, ptr %out1
+  store i8 %v2, ptr %out2
+  store i8 %v3, ptr %out3
+  store i8 %v4, ptr %out4
+  store i8 %v5, ptr %out5
 
   %iv.next = add nuw nsw i64 %iv.0, 6
   %cmp = icmp ult i64 %iv.next, 1024

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-7.ll
new file mode 100644
index 0000000000000..3d07d969afbf1
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-7.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i8 %v6, ptr %out6"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128
+@B = global [1024 x i8] zeroinitializer, align 128
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1
+; SSE2:  LV: Found an estimated cost of 56 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1
+; SSE2:  LV: Found an estimated cost of 112 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1
+; SSE2:  LV: Found an estimated cost of 225 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1
+; SSE2:  LV: Found an estimated cost of 456 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX1:  LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX1:  LV: Found an estimated cost of 64 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX1:  LV: Found an estimated cost of 121 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX1:  LV: Found an estimated cost of 235 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX1:  LV: Found an estimated cost of 581 for VF 32 For instruction: store i8 %v6, ptr %out6, align 1
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX2:  LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX2:  LV: Found an estimated cost of 64 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX2:  LV: Found an estimated cost of 121 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX2:  LV: Found an estimated cost of 235 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX2:  LV: Found an estimated cost of 581 for VF 32 For instruction: store i8 %v6, ptr %out6, align 1
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512DQ:  LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512DQ:  LV: Found an estimated cost of 64 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512DQ:  LV: Found an estimated cost of 122 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512DQ:  LV: Found an estimated cost of 235 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512DQ:  LV: Found an estimated cost of 578 for VF 32 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512DQ:  LV: Found an estimated cost of 1267 for VF 64 For instruction: store i8 %v6, ptr %out6, align 1
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512BW:  LV: Found an estimated cost of 22 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512BW:  LV: Found an estimated cost of 46 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512BW:  LV: Found an estimated cost of 118 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512BW:  LV: Found an estimated cost of 236 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512BW:  LV: Found an estimated cost of 472 for VF 32 For instruction: store i8 %v6, ptr %out6, align 1
+; AVX512BW:  LV: Found an estimated cost of 826 for VF 64 For instruction: store i8 %v6, ptr %out6, align 1
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v = load i8, ptr %in
+
+  %v0 = add i8 %v, 0
+  %v1 = add i8 %v, 1
+  %v2 = add i8 %v, 2
+  %v3 = add i8 %v, 3
+  %v4 = add i8 %v, 4
+  %v5 = add i8 %v, 5
+  %v6 = add i8 %v, 6
+
+  %out0 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.6
+
+  store i8 %v0, ptr %out0
+  store i8 %v1, ptr %out1
+  store i8 %v2, ptr %out2
+  store i8 %v3, ptr %out3
+  store i8 %v4, ptr %out4
+  store i8 %v5, ptr %out5
+  store i8 %v6, ptr %out6
+
+  %iv.next = add nuw nsw i64 %iv.0, 7
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-8.ll
new file mode 100644
index 0000000000000..492bbe006b00f
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-8.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i8 %v7, ptr %out7"
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx  --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512DQ
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx512vl,+avx512bw --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX512BW
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [1024 x i8] zeroinitializer, align 128 ; 1024-byte array that @test loads from
+@B = global [1024 x i8] zeroinitializer, align 128 ; 1024-byte array that @test stores to
+
+define void @test() {
+; SSE2-LABEL: 'test'
+; SSE2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1
+; SSE2:  LV: Found an estimated cost of 56 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1
+; SSE2:  LV: Found an estimated cost of 120 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1
+; SSE2:  LV: Found an estimated cost of 248 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1
+; SSE2:  LV: Found an estimated cost of 504 for VF 16 For instruction: store i8 %v7, ptr %out7, align 1
+;
+; AVX1-LABEL: 'test'
+; AVX1:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX1:  LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX1:  LV: Found an estimated cost of 67 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX1:  LV: Found an estimated cost of 134 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX1:  LV: Found an estimated cost of 268 for VF 16 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX1:  LV: Found an estimated cost of 664 for VF 32 For instruction: store i8 %v7, ptr %out7, align 1
+;
+; AVX2-LABEL: 'test'
+; AVX2:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX2:  LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX2:  LV: Found an estimated cost of 67 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX2:  LV: Found an estimated cost of 134 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX2:  LV: Found an estimated cost of 268 for VF 16 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX2:  LV: Found an estimated cost of 664 for VF 32 For instruction: store i8 %v7, ptr %out7, align 1
+;
+; AVX512DQ-LABEL: 'test'
+; AVX512DQ:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512DQ:  LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512DQ:  LV: Found an estimated cost of 67 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512DQ:  LV: Found an estimated cost of 133 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512DQ:  LV: Found an estimated cost of 266 for VF 16 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512DQ:  LV: Found an estimated cost of 660 for VF 32 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512DQ:  LV: Found an estimated cost of 1448 for VF 64 For instruction: store i8 %v7, ptr %out7, align 1
+;
+; AVX512BW-LABEL: 'test'
+; AVX512BW:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512BW:  LV: Found an estimated cost of 25 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512BW:  LV: Found an estimated cost of 53 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512BW:  LV: Found an estimated cost of 137 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512BW:  LV: Found an estimated cost of 275 for VF 16 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512BW:  LV: Found an estimated cost of 550 for VF 32 For instruction: store i8 %v7, ptr %out7, align 1
+; AVX512BW:  LV: Found an estimated cost of 1100 for VF 64 For instruction: store i8 %v7, ptr %out7, align 1
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ; 0 on entry, %iv.next on the back edge
+  ; Element indices used by this iteration: %iv + 0 through %iv + 7.
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+  %iv.2 = add nuw nsw i64 %iv, 2
+  %iv.3 = add nuw nsw i64 %iv, 3
+  %iv.4 = add nuw nsw i64 %iv, 4
+  %iv.5 = add nuw nsw i64 %iv, 5
+  %iv.6 = add nuw nsw i64 %iv, 6
+  %iv.7 = add nuw nsw i64 %iv, 7
+  ; One i8 is loaded per iteration, from @A at index %iv.0.
+  %in = getelementptr inbounds [1024 x i8], ptr @A, i64 0, i64 %iv.0
+  %v = load i8, ptr %in
+  ; Eight distinct values derived from the loaded byte (%v + 0 through %v + 7).
+  %v0 = add i8 %v, 0
+  %v1 = add i8 %v, 1
+  %v2 = add i8 %v, 2
+  %v3 = add i8 %v, 3
+  %v4 = add i8 %v, 4
+  %v5 = add i8 %v, 5
+  %v6 = add i8 %v, 6
+  %v7 = add i8 %v, 7
+  ; Addresses of the eight consecutive i8 slots in @B at indices %iv.0 through %iv.7.
+  %out0 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.0
+  %out1 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.1
+  %out2 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.2
+  %out3 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.3
+  %out4 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.4
+  %out5 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.5
+  %out6 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.6
+  %out7 = getelementptr inbounds [1024 x i8], ptr @B, i64 0, i64 %iv.7
+  ; Eight byte stores per iteration; the check lines above match only the last one (%v7 to %out7).
+  store i8 %v0, ptr %out0
+  store i8 %v1, ptr %out1
+  store i8 %v2, ptr %out2
+  store i8 %v3, ptr %out3
+  store i8 %v4, ptr %out4
+  store i8 %v5, ptr %out5
+  store i8 %v6, ptr %out6
+  store i8 %v7, ptr %out7
+  ; Advance the index by 8 and loop while the next index is below 1024 (unsigned compare).
+  %iv.next = add nuw nsw i64 %iv.0, 8
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}


        


More information about the llvm-commits mailing list