[llvm] 17f1fc1 - [TTI] BasicTTI::getInterleavedMemoryOpCost(): use getScalarizationOverhead()

Wed Sep 29 08:42:15 PDT 2021

Author: Simon Pilgrim
Date: 2021-09-29T16:41:53+01:00
New Revision: 17f1fc1e5452186795b04fd388532f8bca26c31b

URL: https://github.com/llvm/llvm-project/commit/17f1fc1e5452186795b04fd388532f8bca26c31b
DIFF: https://github.com/llvm/llvm-project/commit/17f1fc1e5452186795b04fd388532f8bca26c31b.diff

LOG: [TTI] BasicTTI::getInterleavedMemoryOpCost(): use getScalarizationOverhead()

getScalarizationOverhead() results in a somewhat better cost estimation than counting the insertion/extraction costs directly. Notably, this is still overestimating the costs.

Original Patch by: @lebedev.ri (Roman Lebedev)

Differential Revision: https://reviews.llvm.org/D110713

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
    llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
    llvm/test/Transforms/LoopVectorize/X86/interleaving.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index de4f97740c9f7..2e3bd424040ca 100644

--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1237,6 +1237,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // Then plus the cost of interleave operation.
     assert(Indices.size() <= Factor &&
            "Interleaved memory op has too many members");
+
+    APInt DemandedLoadStoreElts = APInt::getNullValue(NumElts);
+    for (unsigned Index : Indices) {
+      assert(Index < Factor && "Invalid index for interleaved memory op");
+      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
+        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
+    }
+
     if (Opcode == Instruction::Load) {
       // The interleave cost is similar to extract sub vectors' elements
       // from the wide vector, and insert them into sub vectors.
@@ -1246,21 +1254,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
       // The cost is estimated as extract elements at 0, 2, 4, 6 from the
       // <8 x i32> vector and insert them into a <4 x i32> vector.
-      for (unsigned Index : Indices) {
-        assert(Index < Factor && "Invalid index for interleaved memory op");
-
-        // Extract elements from loaded vector for each sub vector.
-        for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
-          Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
-                                              Index + Elm * Factor);
-      }
-
-      InstructionCost InsSubCost = 0;
-      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
-        InsSubCost +=
-            thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm);
-
+      InstructionCost InsSubCost =
+          getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false);
       Cost += Indices.size() * InsSubCost;
+      Cost +=
+          thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+                                            /*Insert*/ false, /*Extract*/ true);
     } else {
       // The interleave cost is extract elements from sub vectors, and
       // insert them into the wide vector.
@@ -1275,20 +1274,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       // The cost is estimated as extract all elements (of actual members,
       // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
       // i32> vector.
-      InstructionCost ExtSubCost = 0;
-      for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
-        ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement,
-                                                  SubVT, Elm);
+      InstructionCost ExtSubCost =
+          getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
       Cost += ExtSubCost * Indices.size();
-
-      for (unsigned Index : Indices) {
-        assert(Index < Factor && "Invalid index for interleaved memory op");
-
-        // Insert elements from loaded vector for each sub vector.
-        for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
-          Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT,
-                                              Index + Elm * Factor);
-      }
+      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+                                                /*Insert*/ true,
+                                                /*Extract*/ false);
     }
 
     if (!UseMaskForCond)
@@ -1308,13 +1299,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // The cost is estimated as extract all mask elements from the <8xi1> mask
     // vector and insert them factor times into the <24xi1> shuffled mask
     // vector.
-    for (unsigned i = 0; i < NumSubElts; i++)
-      Cost +=
-          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
-
-    for (unsigned i = 0; i < NumElts; i++)
-      Cost +=
-          thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
+    Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
+    Cost +=
+        getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false);
 
     // The Gaps mask is invariant and created outside the loop, therefore the
     // cost of creating it is not accounted for here. However if we have both

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
index 6cb3a41dc79d4..a08974e5ee74f 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-2.ll
@@ -22,8 +22,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 41 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
index 7145ddfb6978b..ceb0310b10c8b 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-3.ll
@@ -22,15 +22,15 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 31 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 58 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 171 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 342 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 31 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 58 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 171 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 342 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 129 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 258 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
index ef806e33bfb91..e7387d33ae5a5 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-4.ll
@@ -22,8 +22,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 41 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 82 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 456 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
index 86057c092bc3b..8868b6baa0ce1 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -21,13 +21,13 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 285 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 99 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX2: LV: Found an estimated cost of 285 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
index ec89a35c41e2d..0a980370b4e54 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-6.ll
@@ -21,7 +21,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 31 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX1: LV: Found an estimated cost of 123 for VF 8 For instruction:   %v0 = load i16, i16* %in0, align 2
-; AVX1: LV: Found an estimated cost of 342 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
+; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction:   %v0 = load i16, i16* %in0, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i16, i16* %in0, align 2
 ; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction:   %v0 = load i16, i16* %in0, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
index a111ddb8e4a6e..d817994b9ee19 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-2.ll
@@ -13,24 +13,24 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 20 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 160 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
-; SSE2: LV: Found an estimated cost of 478 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
+; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
+; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
+; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
+; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX1: LV: Found an estimated cost of 81 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX1: LV: Found an estimated cost of 226 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 33 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 81 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 226 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX2: LV: Found an estimated cost of 166 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
index e4f009adce767..5ab66c0b31541 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-2.ll
@@ -21,9 +21,9 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2
 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 49 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
-; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX1: LV: Found an estimated cost of 35 for VF 8 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX1: LV: Found an estimated cost of 86 for VF 16 For instruction:   store i16 %v1, i16* %out1, align 2
+; AVX1: LV: Found an estimated cost of 172 for VF 32 For instruction:   store i16 %v1, i16* %out1, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v1, i16* %out1, align 2
 ; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction:   store i16 %v1, i16* %out1, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
index 5d002b42cf496..b42da831676a7 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-3.ll
@@ -20,17 +20,17 @@ target triple = "x86_64-unknown-linux-gnu"
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 171 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX1: LV: Found an estimated cost of 342 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1: LV: Found an estimated cost of 30 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1: LV: Found an estimated cost of 53 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1: LV: Found an estimated cost of 129 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX1: LV: Found an estimated cost of 258 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 66 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 171 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
-; AVX2: LV: Found an estimated cost of 342 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2: LV: Found an estimated cost of 30 for VF 4 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2: LV: Found an estimated cost of 53 for VF 8 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2: LV: Found an estimated cost of 129 for VF 16 For instruction:   store i16 %v2, i16* %out2, align 2
+; AVX2: LV: Found an estimated cost of 258 for VF 32 For instruction:   store i16 %v2, i16* %out2, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v2, i16* %out2, align 2
 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction:   store i16 %v2, i16* %out2, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
index 94d8c24c16f3f..6498d88cbe530 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-4.ll
@@ -20,10 +20,10 @@ target triple = "x86_64-unknown-linux-gnu"
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
 ; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
-; AVX1: LV: Found an estimated cost of 456 for VF 32 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1: LV: Found an estimated cost of 172 for VF 16 For instruction:   store i16 %v3, i16* %out3, align 2
+; AVX1: LV: Found an estimated cost of 344 for VF 32 For instruction:   store i16 %v3, i16* %out3, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v3, i16* %out3, align 2
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction:   store i16 %v3, i16* %out3, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
index 8a62afafcbda8..d530664146acd 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-5.ll
@@ -18,16 +18,16 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 58 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 115 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX1: LV: Found an estimated cost of 285 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX1: LV: Found an estimated cost of 215 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 58 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 115 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
-; AVX2: LV: Found an estimated cost of 285 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX2: LV: Found an estimated cost of 27 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX2: LV: Found an estimated cost of 88 for VF 8 For instruction:   store i16 %v4, i16* %out4, align 2
+; AVX2: LV: Found an estimated cost of 215 for VF 16 For instruction:   store i16 %v4, i16* %out4, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v4, i16* %out4, align 2
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction:   store i16 %v4, i16* %out4, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
index 6a68e443f2c51..6df7dd58dfad1 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i16-stride-6.ll
@@ -18,10 +18,10 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 35 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 147 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
-; AVX1: LV: Found an estimated cost of 342 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX1: LV: Found an estimated cost of 53 for VF 4 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction:   store i16 %v5, i16* %out5, align 2
+; AVX1: LV: Found an estimated cost of 258 for VF 16 For instruction:   store i16 %v5, i16* %out5, align 2
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i16 %v5, i16* %out5, align 2
 ; AVX2: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i16 %v5, i16* %out5, align 2

diff  --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
index 3dc6e1c8326f5..c04c4aa9f040c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-2.ll
@@ -13,24 +13,24 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
-; SSE2: LV: Found an estimated cost of 27 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
-; SSE2: LV: Found an estimated cost of 79 for VF 4 For instruction:   store i8 %v1, i8* %out1, align 1
-; SSE2: LV: Found an estimated cost of 238 for VF 8 For instruction:   store i8 %v1, i8* %out1, align 1
-; SSE2: LV: Found an estimated cost of 478 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
+; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
+; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction:   store i8 %v1, i8* %out1, align 1
+; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction:   store i8 %v1, i8* %out1, align 1
+; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
 ; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction:   store i8 %v1, i8* %out1, align 1
 ; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX1: LV: Found an estimated cost of 97 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX1: LV: Found an estimated cost of 226 for VF 32 For instruction:   store i8 %v1, i8* %out1, align 1
+; AVX1: LV: Found an estimated cost of 67 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
+; AVX1: LV: Found an estimated cost of 166 for VF 32 For instruction:   store i8 %v1, i8* %out1, align 1
 
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1
 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction:   store i8 %v1, i8* %out1, align 1
 ; AVX2: LV: Found an estimated cost of 33 for VF 8 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX2: LV: Found an estimated cost of 97 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
-; AVX2: LV: Found an estimated cost of 226 for VF 32 For instruction:   store i8 %v1, i8* %out1, align 1
+; AVX2: LV: Found an estimated cost of 67 for VF 16 For instruction:   store i8 %v1, i8* %out1, align 1
+; AVX2: LV: Found an estimated cost of 166 for VF 32 For instruction:   store i8 %v1, i8* %out1, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v1, i8* %out1, align 1
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction:   store i8 %v1, i8* %out1, align 1

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
index 36bccb7cda208..86facbb65b6a2 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=SSE
-; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefix=AVX
-; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefix=AVX1
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX2
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SSE
 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SSE
 
@@ -26,34 +26,63 @@ define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b)
 ; SSE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
 ; SSE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
 ;
-; AVX-LABEL: @foo(
-; AVX-NEXT:  entry:
-; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX:       vector.ph:
-; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
-; AVX:       vector.body:
-; AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
-; AVX-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; AVX-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
-; AVX-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
-; AVX-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; AVX-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; AVX-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
-; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; AVX-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
-; AVX:       middle.block:
-; AVX-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; AVX:       scalar.ph:
-; AVX-NEXT:    br label [[FOR_BODY:%.*]]
-; AVX:       for.cond.cleanup:
-; AVX-NEXT:    ret void
-; AVX:       for.body:
-; AVX-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; AVX1-LABEL: @foo(
+; AVX1-NEXT:  entry:
+; AVX1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX1:       vector.ph:
+; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX1:       vector.body:
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX1-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; AVX1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; AVX1-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
+; AVX1-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
+; AVX1-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX1-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
+; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; AVX1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX1-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX1:       middle.block:
+; AVX1-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; AVX1:       scalar.ph:
+; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX1:       for.cond.cleanup:
+; AVX1-NEXT:    ret void
+; AVX1:       for.body:
+; AVX1-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+;
+; AVX2-LABEL: @foo(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX2:       vector.ph:
+; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AVX2:       vector.body:
+; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; AVX2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; AVX2-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <16 x i32>*
+; AVX2-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP2]], align 4
+; AVX2-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX2-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX2-NEXT:    [[TMP3:%.*]] = add nsw <8 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; AVX2-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX2-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* [[TMP5]], align 4
+; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; AVX2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; AVX2-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX2:       middle.block:
+; AVX2-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; AVX2:       scalar.ph:
+; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
+; AVX2:       for.cond.cleanup:
+; AVX2-NEXT:    ret void
+; AVX2:       for.body:
+; AVX2-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ;
 entry:
   br label %for.body