[llvm] [LV] Stability fix for outerloop vectorization (PR #68118)

Kolya Panchenko via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 17 14:04:25 PST 2023


https://github.com/nikolaypanchenko updated https://github.com/llvm/llvm-project/pull/68118

>From 8090c997c008be21ed9a38984f1777ec56042413 Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Mon, 2 Oct 2023 13:52:26 -0700
Subject: [PATCH 1/3] [LV] Stability fix for outerloop vectorization

The HCFG builder doesn't correctly handle cases where a non-outermost loop
is requested to be vectorized.

[Original] Differential Revision: https://reviews.llvm.org/D150700
---
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp |  15 +-
 .../outer_loop_hcfg_construction.ll           | 461 ++++++++++++++++++
 2 files changed, 475 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll

diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index e90f5b94a64238b..6114287b93e67d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -156,6 +156,19 @@ static bool isHeaderVPBB(VPBasicBlock *VPBB) {
   return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB;
 }
 
+/// Return true if \p L is \p OuterLoop itself or is nested at any depth in it.
+static bool isNestedLoop(const Loop *L, const Loop *OuterLoop) {
+  if (L->getLoopDepth() < OuterLoop->getLoopDepth())
+    return false; // A shallower loop cannot be nested in a deeper one.
+  const Loop *P = L; // Walk up the parent chain, starting at L itself.
+  while (P) {
+    if (P == OuterLoop)
+      return true;
+    P = P->getParentLoop();
+  }
+  return false;
+}
+
 // Create a new empty VPBasicBlock for an incoming BasicBlock in the region
 // corresponding to the containing loop  or retrieve an existing one if it was
 // already created. If no region exists yet for the loop containing \p BB, a new
@@ -174,7 +187,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
 
   // Get or create a region for the loop containing BB.
   Loop *LoopOfBB = LI->getLoopFor(BB);
-  if (!LoopOfBB)
+  if (!LoopOfBB || !isNestedLoop(LoopOfBB, TheLoop))
     return VPBB;
 
   auto *RegionOfVPBB = Loop2Region.lookup(LoopOfBB);
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
new file mode 100644
index 000000000000000..895a129d231ba96
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
@@ -0,0 +1,461 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path < %s -S | FileCheck %s
+; void test(int n, int **a)
+; {
+;   for (int k = 0; k < n; ++k) {
+;     a[k][0] = 0;
+;     #pragma clang loop vectorize_width(4)
+;     for (int i = 0; i < n; ++i) {
+;         for (int j = 0; j < n; ++j) {
+;             a[i][j] = 2 + k;
+;         }
+;     }
+;   }
+; }
+
+define void @test(i64 %n, ptr %a) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP34:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP34]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.us.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
+; CHECK:       for.body.us:
+; CHECK-NEXT:    [[IV42:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[IV_NEXT43:%.*]], [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV42]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[IV42]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[A]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP3]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    br label [[FOR_BODY9_US_US1:%.*]]
+; CHECK:       for.body9.us.us1:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR_BODY9_US_US1]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4]], label [[FOR_BODY9_US_US1]]
+; CHECK:       for.cond6.for.cond.cleanup8_crit_edge.us.us4:
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
+; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER_US_US:%.*]]
+; CHECK:       for.cond6.preheader.us.us:
+; CHECK-NEXT:    [[IV37:%.*]] = phi i64 [ [[IV_NEXT38:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX11_US_US:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV37]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[ARRAYIDX11_US_US]], align 8
+; CHECK-NEXT:    br label [[FOR_BODY9_US_US:%.*]]
+; CHECK:       for.body9.us.us:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY9_US_US]] ], [ 0, [[FOR_COND6_PREHEADER_US_US]] ]
+; CHECK-NEXT:    [[ARRAYIDX13_US_US:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[ARRAYIDX13_US_US]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US]], label [[FOR_BODY9_US_US]]
+; CHECK:       for.cond6.for.cond.cleanup8_crit_edge.us.us:
+; CHECK-NEXT:    [[IV_NEXT38]] = add nuw nsw i64 [[IV37]], 1
+; CHECK-NEXT:    [[EXITCOND41_NOT:%.*]] = icmp eq i64 [[IV_NEXT38]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND41_NOT]], label [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US]], label [[FOR_COND6_PREHEADER_US_US]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.cond2.for.cond.cleanup4_crit_edge.split.us.us:
+; CHECK-NEXT:    [[IV_NEXT43]] = add nuw nsw i64 [[IV42]], 1
+; CHECK-NEXT:    [[EXITCOND47_NOT:%.*]] = icmp eq i64 [[IV_NEXT43]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND47_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_US]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp34 = icmp sgt i64 %n, 0
+  br i1 %cmp34, label %for.body.us.preheader, label %for.cond.cleanup
+
+for.body.us.preheader:
+  br label %for.body.us
+
+for.body.us:
+  %iv42 = phi i64 [ 0, %for.body.us.preheader ], [ %iv.next43, %for.cond2.for.cond.cleanup4_crit_edge.split.us.us ]
+  %arrayidx.us = getelementptr inbounds ptr, ptr %a, i64 %iv42
+  %0 = load ptr, ptr %arrayidx.us, align 8
+  store i32 0, ptr %0, align 4
+  %1 = trunc i64 %iv42 to i32
+  %2 = add i32 %1, 2
+  br label %for.cond6.preheader.us.us
+
+for.cond6.preheader.us.us:
+  %iv37 = phi i64 [ %iv.next38, %for.cond6.for.cond.cleanup8_crit_edge.us.us ], [ 0, %for.body.us ]
+  %arrayidx11.us.us = getelementptr inbounds ptr, ptr %a, i64 %iv37
+  %3 = load ptr, ptr %arrayidx11.us.us, align 8
+  br label %for.body9.us.us
+
+for.body9.us.us:
+  %iv = phi i64 [ %iv.next, %for.body9.us.us ], [ 0, %for.cond6.preheader.us.us ]
+  %arrayidx13.us.us = getelementptr inbounds i32, ptr %3, i64 %iv
+  store i32 %2, ptr %arrayidx13.us.us, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.cond6.for.cond.cleanup8_crit_edge.us.us, label %for.body9.us.us
+
+for.cond6.for.cond.cleanup8_crit_edge.us.us:
+  %iv.next38 = add nuw nsw i64 %iv37, 1
+  %exitcond41.not = icmp eq i64 %iv.next38, %n
+  br i1 %exitcond41.not, label %for.cond2.for.cond.cleanup4_crit_edge.split.us.us, label %for.cond6.preheader.us.us, !llvm.loop !3
+
+for.cond2.for.cond.cleanup4_crit_edge.split.us.us:
+  %iv.next43 = add nuw nsw i64 %iv42, 1
+  %exitcond47.not = icmp eq i64 %iv.next43, %n
+  br i1 %exitcond47.not, label %for.cond.cleanup.loopexit, label %for.body.us
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; void test1(int n, int **a)
+; {
+;   for (int k = 0; k < n; ++k) {
+;     a[k][0] = 0;
+;     for (int i = 0; i < n; ++i) {
+;     #pragma clang loop vectorize_width(4)
+;         for (int j = 0; j < n; ++j) {
+;             for (int x = 0; x < n; ++x) {
+;               a[i][j] = 2 + k+x;
+;             }
+;         }
+;
+;         for (int j = 0; j < n; ++j) {
+;             for (int x = 0; x < n; ++x) {
+;               a[i][j] += 2 + k+x;
+;             }
+;         }
+;     }
+;   }
+; }
+define void @test1(i32 %n, ptr %a) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP84:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP84]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT104:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER_LR_PH:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.cond6.preheader.lr.ph:
+; CHECK-NEXT:    [[IV99:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT100:%.*]], [[FOR_COND_CLEANUP4:%.*]] ]
+; CHECK-NEXT:    [[IV87:%.*]] = phi i32 [ [[N]], [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT88:%.*]], [[FOR_COND_CLEANUP4]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV99]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[IV99]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], [[N]]
+; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER:%.*]]
+; CHECK:       for.cond6.preheader:
+; CHECK-NEXT:    [[IV94:%.*]] = phi i64 [ 0, [[FOR_COND6_PREHEADER_LR_PH]] ], [ [[IV_NEXT95:%.*]], [[FOR_COND_CLEANUP25:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV94]]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load ptr, ptr [[ARRAYIDX16]], align 8
+; CHECK-NEXT:    br label [[FOR_COND10_PREHEADER:%.*]]
+; CHECK:       for.cond.cleanup4:
+; CHECK-NEXT:    [[IV_NEXT100]] = add nuw nsw i64 [[IV99]], 1
+; CHECK-NEXT:    [[IV_NEXT88]] = add i32 [[IV87]], [[N]]
+; CHECK-NEXT:    [[EXITCOND105_NOT:%.*]] = icmp eq i64 [[IV_NEXT100]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT:    br i1 [[EXITCOND105_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND6_PREHEADER_LR_PH]]
+; CHECK:       for.cond10.preheader:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[FOR_COND6_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_COND10_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND28_PREHEADER_PREHEADER:%.*]], label [[FOR_COND10_PREHEADER]]
+; CHECK:       for.cond28.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND28_PREHEADER:%.*]]
+; CHECK:       for.cond28.preheader:
+; CHECK-NEXT:    [[IV89:%.*]] = phi i64 [ [[IV_NEXT90:%.*]], [[FOR_COND28_PREHEADER]] ], [ 0, [[FOR_COND28_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV89]]
+; CHECK-NEXT:    [[ARRAYIDX37_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[IV87]], [[ARRAYIDX37_PROMOTED]]
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[IV_NEXT90]] = add nuw nsw i64 [[IV89]], 1
+; CHECK-NEXT:    [[EXITCOND93_NOT:%.*]] = icmp eq i64 [[IV_NEXT90]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT:    br i1 [[EXITCOND93_NOT]], label [[FOR_COND_CLEANUP25]], label [[FOR_COND28_PREHEADER]]
+; CHECK:       for.cond.cleanup25:
+; CHECK-NEXT:    [[IV_NEXT95]] = add nuw nsw i64 [[IV94]], 1
+; CHECK-NEXT:    [[EXITCOND98_NOT:%.*]] = icmp eq i64 [[IV_NEXT95]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT:    br i1 [[EXITCOND98_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_COND6_PREHEADER]]
+;
+entry:
+  %cmp84 = icmp sgt i32 %n, 0
+  br i1 %cmp84, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:
+  %wide.trip.count104 = zext i32 %n to i64
+  br label %for.cond6.preheader.lr.ph
+
+for.cond.cleanup:
+  ret void
+
+for.cond6.preheader.lr.ph:
+  %iv99 = phi i64 [ 0, %for.body.lr.ph ], [ %iv.next100, %for.cond.cleanup4 ]
+  %iv87 = phi i32 [ %n, %for.body.lr.ph ], [ %iv.next88, %for.cond.cleanup4 ]
+  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %iv99
+  %0 = load ptr, ptr %arrayidx, align 8
+  store i32 0, ptr %0, align 4
+  %1 = trunc i64 %iv99 to i32
+  %2 = add i32 %1, 2
+  %3 = add i32 %2, %n
+  br label %for.cond6.preheader
+
+for.cond6.preheader:
+  %iv94 = phi i64 [ 0, %for.cond6.preheader.lr.ph ], [ %iv.next95, %for.cond.cleanup25 ]
+  %arrayidx16 = getelementptr inbounds ptr, ptr %a, i64 %iv94
+  %.pre = load ptr, ptr %arrayidx16, align 8
+  br label %for.cond10.preheader
+
+for.cond.cleanup4:
+  %iv.next100 = add nuw nsw i64 %iv99, 1
+  %iv.next88 = add i32 %iv87, %n
+  %exitcond105.not = icmp eq i64 %iv.next100, %wide.trip.count104
+  br i1 %exitcond105.not, label %for.cond.cleanup, label %for.cond6.preheader.lr.ph
+
+for.cond10.preheader:
+  %iv = phi i64 [ 0, %for.cond6.preheader ], [ %iv.next, %for.cond10.preheader ]
+  %arrayidx18 = getelementptr inbounds i32, ptr %.pre, i64 %iv
+  store i32 %3, ptr %arrayidx18, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count104
+  br i1 %exitcond.not, label %for.cond28.preheader, label %for.cond10.preheader
+
+for.cond28.preheader:
+  %iv89 = phi i64 [ %iv.next90, %for.cond28.preheader ], [ 0, %for.cond10.preheader ]
+  %arrayidx37 = getelementptr inbounds i32, ptr %.pre, i64 %iv89
+  %arrayidx37.promoted = load i32, ptr %arrayidx37, align 4
+  %4 = add i32 %iv87, %arrayidx37.promoted
+  store i32 %4, ptr %arrayidx37, align 4
+  %iv.next90 = add nuw nsw i64 %iv89, 1
+  %exitcond93.not = icmp eq i64 %iv.next90, %wide.trip.count104
+  br i1 %exitcond93.not, label %for.cond.cleanup25, label %for.cond28.preheader
+
+for.cond.cleanup25:
+  %iv.next95 = add nuw nsw i64 %iv94, 1
+  %exitcond98.not = icmp eq i64 %iv.next95, %wide.trip.count104
+  br i1 %exitcond98.not, label %for.cond.cleanup4, label %for.cond6.preheader
+}
+
+; void test2(int n, int **a)
+; {
+;   for (int k = 0; k < n; ++k) {
+;     a[k][0] = 0;
+;     #pragma clang loop vectorize_width(4)
+;     for (int i = 0; i < n; ++i) {
+;         for (int j = 0; j < n; ++j) {
+;             for (int x = 0; x < n; ++x) {
+;               a[i][j] = 2 + k+x;
+;             }
+;         }
+;
+;         for (int j = 0; j < n; ++j) {
+;             for (int x = 0; x < n; ++x) {
+;               a[i][j] += 2 + k+x;
+;             }
+;         }
+;     }
+;   }
+; }
+define void @test2(i32 %n, ptr %a) {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP84:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP84]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT104:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER_LR_PH:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.cond6.preheader.lr.ph:
+; CHECK-NEXT:    [[IV99:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT100:%.*]], [[FOR_COND_CLEANUP4:%.*]] ]
+; CHECK-NEXT:    [[IV87:%.*]] = phi i32 [ [[N]], [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT88:%.*]], [[FOR_COND_CLEANUP4]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV99]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[IV99]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], [[N]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT104]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT104]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT104]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[WIDE_TRIP_COUNT104]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[IV87]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND_CLEANUP2510:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND_CLEANUP2510]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[A]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    br label [[FOR_COND10_PREHEADER1:%.*]]
+; CHECK:       for.cond10.preheader1:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_COND10_PREHEADER1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP6]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_COND28_PREHEADER_PREHEADER4:%.*]], label [[FOR_COND10_PREHEADER1]]
+; CHECK:       for.cond28.preheader.preheader4:
+; CHECK-NEXT:    br label [[FOR_COND28_PREHEADER5:%.*]]
+; CHECK:       for.cond28.preheader5:
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i64> [ zeroinitializer, [[FOR_COND28_PREHEADER_PREHEADER4]] ], [ [[TMP11:%.*]], [[FOR_COND28_PREHEADER5]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI6]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT9]], [[WIDE_MASKED_GATHER7]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP10]], <4 x ptr> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP11]] = add nuw nsw <4 x i64> [[VEC_PHI6]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq <4 x i64> [[TMP11]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    br i1 [[TMP13]], label [[FOR_COND_CLEANUP2510]], label [[FOR_COND28_PREHEADER5]]
+; CHECK:       for.cond.cleanup2510:
+; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <4 x i64> [[TMP14]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT104]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP4]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND6_PREHEADER_LR_PH]] ]
+; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER:%.*]]
+; CHECK:       for.cond6.preheader:
+; CHECK-NEXT:    [[IV94:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT95:%.*]], [[FOR_COND_CLEANUP25:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV94]]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load ptr, ptr [[ARRAYIDX16]], align 8
+; CHECK-NEXT:    br label [[FOR_COND10_PREHEADER:%.*]]
+; CHECK:       for.cond.cleanup4:
+; CHECK-NEXT:    [[IV_NEXT100]] = add nuw nsw i64 [[IV99]], 1
+; CHECK-NEXT:    [[IV_NEXT88]] = add i32 [[IV87]], [[N]]
+; CHECK-NEXT:    [[EXITCOND105_NOT:%.*]] = icmp eq i64 [[IV_NEXT100]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT:    br i1 [[EXITCOND105_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND6_PREHEADER_LR_PH]]
+; CHECK:       for.cond10.preheader:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[FOR_COND6_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_COND10_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND28_PREHEADER_PREHEADER:%.*]], label [[FOR_COND10_PREHEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.cond28.preheader.preheader:
+; CHECK-NEXT:    br label [[FOR_COND28_PREHEADER:%.*]]
+; CHECK:       for.cond28.preheader:
+; CHECK-NEXT:    [[IV89:%.*]] = phi i64 [ [[IV_NEXT90:%.*]], [[FOR_COND28_PREHEADER]] ], [ 0, [[FOR_COND28_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV89]]
+; CHECK-NEXT:    [[ARRAYIDX37_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[IV87]], [[ARRAYIDX37_PROMOTED]]
+; CHECK-NEXT:    store i32 [[TMP17]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[IV_NEXT90]] = add nuw nsw i64 [[IV89]], 1
+; CHECK-NEXT:    [[EXITCOND93_NOT:%.*]] = icmp eq i64 [[IV_NEXT90]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT:    br i1 [[EXITCOND93_NOT]], label [[FOR_COND_CLEANUP25]], label [[FOR_COND28_PREHEADER]]
+; CHECK:       for.cond.cleanup25:
+; CHECK-NEXT:    [[IV_NEXT95]] = add nuw nsw i64 [[IV94]], 1
+; CHECK-NEXT:    [[EXITCOND98_NOT:%.*]] = icmp eq i64 [[IV_NEXT95]], [[WIDE_TRIP_COUNT104]]
+; CHECK-NEXT:    br i1 [[EXITCOND98_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_COND6_PREHEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+entry:
+  %cmp84 = icmp sgt i32 %n, 0
+  br i1 %cmp84, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:
+  %wide.trip.count104 = zext i32 %n to i64
+  br label %for.cond6.preheader.lr.ph
+
+for.cond.cleanup:
+  ret void
+
+for.cond6.preheader.lr.ph:
+  %iv99 = phi i64 [ 0, %for.body.lr.ph ], [ %iv.next100, %for.cond.cleanup4 ]
+  %iv87 = phi i32 [ %n, %for.body.lr.ph ], [ %iv.next88, %for.cond.cleanup4 ]
+  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %iv99
+  %0 = load ptr, ptr %arrayidx, align 8
+  store i32 0, ptr %0, align 4
+  %1 = trunc i64 %iv99 to i32
+  %2 = add i32 %1, 2
+  %3 = add i32 %2, %n
+  br label %for.cond6.preheader
+
+for.cond6.preheader:
+  %iv94 = phi i64 [ 0, %for.cond6.preheader.lr.ph ], [ %iv.next95, %for.cond.cleanup25 ]
+  %arrayidx16 = getelementptr inbounds ptr, ptr %a, i64 %iv94
+  %.pre = load ptr, ptr %arrayidx16, align 8
+  br label %for.cond10.preheader
+
+for.cond.cleanup4:
+  %iv.next100 = add nuw nsw i64 %iv99, 1
+  %iv.next88 = add i32 %iv87, %n
+  %exitcond105.not = icmp eq i64 %iv.next100, %wide.trip.count104
+  br i1 %exitcond105.not, label %for.cond.cleanup, label %for.cond6.preheader.lr.ph
+
+for.cond10.preheader:
+  %iv = phi i64 [ 0, %for.cond6.preheader ], [ %iv.next, %for.cond10.preheader ]
+  %arrayidx18 = getelementptr inbounds i32, ptr %.pre, i64 %iv
+  store i32 %3, ptr %arrayidx18, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count104
+  br i1 %exitcond.not, label %for.cond28.preheader, label %for.cond10.preheader, !llvm.loop !3
+
+for.cond28.preheader:
+  %iv89 = phi i64 [ %iv.next90, %for.cond28.preheader ], [ 0, %for.cond10.preheader ]
+  %arrayidx37 = getelementptr inbounds i32, ptr %.pre, i64 %iv89
+  %arrayidx37.promoted = load i32, ptr %arrayidx37, align 4
+  %4 = add i32 %iv87, %arrayidx37.promoted
+  store i32 %4, ptr %arrayidx37, align 4
+  %iv.next90 = add nuw nsw i64 %iv89, 1
+  %exitcond93.not = icmp eq i64 %iv.next90, %wide.trip.count104
+  br i1 %exitcond93.not, label %for.cond.cleanup25, label %for.cond28.preheader
+
+for.cond.cleanup25:
+  %iv.next95 = add nuw nsw i64 %iv94, 1
+  %exitcond98.not = icmp eq i64 %iv.next95, %wide.trip.count104
+  br i1 %exitcond98.not, label %for.cond.cleanup4, label %for.cond6.preheader, !llvm.loop !3
+}
+
+!3 = distinct !{!3, !4, !5, !6}
+!4 = !{!"llvm.loop.vectorize.width", i32 4}
+!5 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}

>From 3a41fe496b66041ba22460368b0dc113c0d25e52 Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Thu, 2 Nov 2023 15:52:20 -0700
Subject: [PATCH 2/3] Updated test

---
 .../outer_loop_hcfg_construction.ll           | 564 +++++++-----------
 1 file changed, 213 insertions(+), 351 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
index 895a129d231ba96..1c20791fb42ad40 100644
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path < %s -S | FileCheck %s
+
 ; void test(int n, int **a)
 ; {
 ;   for (int k = 0; k < n; ++k) {
@@ -12,21 +12,20 @@
 ;     }
 ;   }
 ; }
-
-define void @test(i64 %n, ptr %a) {
-; CHECK-LABEL: define void @test(
+;
+; Make sure the VPlan HCFG is constructed when we try to vectorize a non-outermost loop
+;
+define void @non_outermost_loop_hcfg_construction(i64 %n, ptr %a) {
+; CHECK-LABEL: define void @non_outermost_loop_hcfg_construction(
 ; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP34:%.*]] = icmp sgt i64 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP34]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.us.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
-; CHECK:       for.body.us:
-; CHECK-NEXT:    [[IV42:%.*]] = phi i64 [ 0, [[FOR_BODY_US_PREHEADER]] ], [ [[IV_NEXT43:%.*]], [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV42]]
+; CHECK-NEXT:    br label [[OUTERMOST_LOOP:%.*]]
+; CHECK:       outermost.loop:
+; CHECK-NEXT:    [[K:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[K_NEXT:%.*]], [[OUTERMOST_LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[K]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_US]], align 8
 ; CHECK-NEXT:    store i32 0, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[IV42]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[K]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -39,20 +38,20 @@ define void @test(i64 %n, ptr %a) {
 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[MIDDLE_LOOP_LATCH4:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[MIDDLE_LOOP_LATCH4]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[A]], <4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP3]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
-; CHECK-NEXT:    br label [[FOR_BODY9_US_US1:%.*]]
-; CHECK:       for.body9.us.us1:
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR_BODY9_US_US1]] ]
+; CHECK-NEXT:    br label [[INNERMOST_LOOP1:%.*]]
+; CHECK:       innermost.loop1:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[INNERMOST_LOOP1]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US4]], label [[FOR_BODY9_US_US1]]
-; CHECK:       for.cond6.for.cond.cleanup8_crit_edge.us.us4:
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_LOOP_LATCH4]], label [[INNERMOST_LOOP1]]
+; CHECK:       middle.loop.latch4:
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT3]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -61,398 +60,261 @@ define void @test(i64 %n, ptr %a) {
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US]], label [[SCALAR_PH]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[OUTERMOST_LOOP_LATCH]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_US]] ]
-; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER_US_US:%.*]]
-; CHECK:       for.cond6.preheader.us.us:
-; CHECK-NEXT:    [[IV37:%.*]] = phi i64 [ [[IV_NEXT38:%.*]], [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX11_US_US:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV37]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERMOST_LOOP]] ]
+; CHECK-NEXT:    br label [[MIDDLE_LOOP:%.*]]
+; CHECK:       middle.loop:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[MIDDLE_LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX11_US_US:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[I]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[ARRAYIDX11_US_US]], align 8
-; CHECK-NEXT:    br label [[FOR_BODY9_US_US:%.*]]
-; CHECK:       for.body9.us.us:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY9_US_US]] ], [ 0, [[FOR_COND6_PREHEADER_US_US]] ]
-; CHECK-NEXT:    [[ARRAYIDX13_US_US:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[IV]]
+; CHECK-NEXT:    br label [[INNERMOST_LOOP:%.*]]
+; CHECK:       innermost.loop:
+; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[J_NEXT:%.*]], [[INNERMOST_LOOP]] ], [ 0, [[MIDDLE_LOOP]] ]
+; CHECK-NEXT:    [[ARRAYIDX13_US_US:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[J]]
 ; CHECK-NEXT:    store i32 [[TMP2]], ptr [[ARRAYIDX13_US_US]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND6_FOR_COND_CLEANUP8_CRIT_EDGE_US_US]], label [[FOR_BODY9_US_US]]
-; CHECK:       for.cond6.for.cond.cleanup8_crit_edge.us.us:
-; CHECK-NEXT:    [[IV_NEXT38]] = add nuw nsw i64 [[IV37]], 1
-; CHECK-NEXT:    [[EXITCOND41_NOT:%.*]] = icmp eq i64 [[IV_NEXT38]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND41_NOT]], label [[FOR_COND2_FOR_COND_CLEANUP4_CRIT_EDGE_SPLIT_US_US]], label [[FOR_COND6_PREHEADER_US_US]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       for.cond2.for.cond.cleanup4_crit_edge.split.us.us:
-; CHECK-NEXT:    [[IV_NEXT43]] = add nuw nsw i64 [[IV42]], 1
-; CHECK-NEXT:    [[EXITCOND47_NOT:%.*]] = icmp eq i64 [[IV_NEXT43]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND47_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_US]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[J_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[MIDDLE_LOOP_LATCH]], label [[INNERMOST_LOOP]]
+; CHECK:       middle.loop.latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND41_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND41_NOT]], label [[OUTERMOST_LOOP_LATCH]], label [[MIDDLE_LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       outermost.loop.latch:
+; CHECK-NEXT:    [[K_NEXT]] = add nuw nsw i64 [[K]], 1
+; CHECK-NEXT:    [[EXITCOND47_NOT:%.*]] = icmp eq i64 [[K_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND47_NOT]], label [[OUTERMOST_LOOP_POSTEXIT:%.*]], label [[OUTERMOST_LOOP]]
+; CHECK:       outermost.loop.postexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %cmp34 = icmp sgt i64 %n, 0
-  br i1 %cmp34, label %for.body.us.preheader, label %for.cond.cleanup
-
-for.body.us.preheader:
-  br label %for.body.us
+  br label %outermost.loop
 
-for.body.us:
-  %iv42 = phi i64 [ 0, %for.body.us.preheader ], [ %iv.next43, %for.cond2.for.cond.cleanup4_crit_edge.split.us.us ]
-  %arrayidx.us = getelementptr inbounds ptr, ptr %a, i64 %iv42
+outermost.loop:
+  %k = phi i64 [ 0, %entry ], [ %k.next, %outermost.loop.latch ]
+  %arrayidx.us = getelementptr inbounds ptr, ptr %a, i64 %k
   %0 = load ptr, ptr %arrayidx.us, align 8
   store i32 0, ptr %0, align 4
-  %1 = trunc i64 %iv42 to i32
+  %1 = trunc i64 %k to i32
   %2 = add i32 %1, 2
-  br label %for.cond6.preheader.us.us
+  br label %middle.loop
 
-for.cond6.preheader.us.us:
-  %iv37 = phi i64 [ %iv.next38, %for.cond6.for.cond.cleanup8_crit_edge.us.us ], [ 0, %for.body.us ]
-  %arrayidx11.us.us = getelementptr inbounds ptr, ptr %a, i64 %iv37
+middle.loop:
+  %i = phi i64 [ %i.next, %middle.loop.latch ], [ 0, %outermost.loop ]
+  %arrayidx11.us.us = getelementptr inbounds ptr, ptr %a, i64 %i
   %3 = load ptr, ptr %arrayidx11.us.us, align 8
-  br label %for.body9.us.us
+  br label %innermost.loop
 
-for.body9.us.us:
-  %iv = phi i64 [ %iv.next, %for.body9.us.us ], [ 0, %for.cond6.preheader.us.us ]
-  %arrayidx13.us.us = getelementptr inbounds i32, ptr %3, i64 %iv
+innermost.loop:
+  %j = phi i64 [ %j.next, %innermost.loop ], [ 0, %middle.loop ]
+  %arrayidx13.us.us = getelementptr inbounds i32, ptr %3, i64 %j
   store i32 %2, ptr %arrayidx13.us.us, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %n
-  br i1 %exitcond.not, label %for.cond6.for.cond.cleanup8_crit_edge.us.us, label %for.body9.us.us
+  %j.next = add nuw nsw i64 %j, 1
+  %exitcond.not = icmp eq i64 %j.next, %n
+  br i1 %exitcond.not, label %middle.loop.latch, label %innermost.loop
 
-for.cond6.for.cond.cleanup8_crit_edge.us.us:
-  %iv.next38 = add nuw nsw i64 %iv37, 1
-  %exitcond41.not = icmp eq i64 %iv.next38, %n
-  br i1 %exitcond41.not, label %for.cond2.for.cond.cleanup4_crit_edge.split.us.us, label %for.cond6.preheader.us.us, !llvm.loop !3
+middle.loop.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond41.not = icmp eq i64 %i.next, %n
+  br i1 %exitcond41.not, label %outermost.loop.latch, label %middle.loop, !llvm.loop !3
 
-for.cond2.for.cond.cleanup4_crit_edge.split.us.us:
-  %iv.next43 = add nuw nsw i64 %iv42, 1
-  %exitcond47.not = icmp eq i64 %iv.next43, %n
-  br i1 %exitcond47.not, label %for.cond.cleanup.loopexit, label %for.body.us
+outermost.loop.latch:
+  %k.next = add nuw nsw i64 %k, 1
+  %exitcond47.not = icmp eq i64 %k.next, %n
+  br i1 %exitcond47.not, label %outermost.loop.postexit, label %outermost.loop
 
-for.cond.cleanup.loopexit:
+outermost.loop.postexit:
   br label %for.cond.cleanup
 
 for.cond.cleanup:
   ret void
 }
 
-; void test1(int n, int **a)
+; void non_outermost_loop_hcfg_construction_other_loops_at_same_level(long n, int **a)
 ; {
-;   for (int k = 0; k < n; ++k) {
+;   for (long k = 0; k < n; ++k) {
 ;     a[k][0] = 0;
-;     for (int i = 0; i < n; ++i) {
-;     #pragma clang loop vectorize_width(4)
-;         for (int j = 0; j < n; ++j) {
-;             for (int x = 0; x < n; ++x) {
-;               a[i][j] = 2 + k+x;
+;     for (long i = 0; i < n; ++i) {
+;         #pragma clang loop vectorize_width(4)
+;         for (long j0 = 0; j0 < n; ++j0) {
+;             for (long x = 0; x < n; ++x) {
+;               a[x+i][j0] = 2 + k+x;
 ;             }
 ;         }
 ;
-;         for (int j = 0; j < n; ++j) {
-;             for (int x = 0; x < n; ++x) {
-;               a[i][j] += 2 + k+x;
-;             }
+;         for (long j1 = n; j1 > 0; --j1) {
+;           a[i][j1] *= j1 & 1;
 ;         }
 ;     }
 ;   }
 ; }
-define void @test1(i32 %n, ptr %a) {
-; CHECK-LABEL: define void @test1(
-; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP84:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP84]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.lr.ph:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT104:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER_LR_PH:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    ret void
-; CHECK:       for.cond6.preheader.lr.ph:
-; CHECK-NEXT:    [[IV99:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT100:%.*]], [[FOR_COND_CLEANUP4:%.*]] ]
-; CHECK-NEXT:    [[IV87:%.*]] = phi i32 [ [[N]], [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT88:%.*]], [[FOR_COND_CLEANUP4]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV99]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    store i32 0, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[IV99]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], [[N]]
-; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER:%.*]]
-; CHECK:       for.cond6.preheader:
-; CHECK-NEXT:    [[IV94:%.*]] = phi i64 [ 0, [[FOR_COND6_PREHEADER_LR_PH]] ], [ [[IV_NEXT95:%.*]], [[FOR_COND_CLEANUP25:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV94]]
-; CHECK-NEXT:    [[DOTPRE:%.*]] = load ptr, ptr [[ARRAYIDX16]], align 8
-; CHECK-NEXT:    br label [[FOR_COND10_PREHEADER:%.*]]
-; CHECK:       for.cond.cleanup4:
-; CHECK-NEXT:    [[IV_NEXT100]] = add nuw nsw i64 [[IV99]], 1
-; CHECK-NEXT:    [[IV_NEXT88]] = add i32 [[IV87]], [[N]]
-; CHECK-NEXT:    [[EXITCOND105_NOT:%.*]] = icmp eq i64 [[IV_NEXT100]], [[WIDE_TRIP_COUNT104]]
-; CHECK-NEXT:    br i1 [[EXITCOND105_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND6_PREHEADER_LR_PH]]
-; CHECK:       for.cond10.preheader:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[FOR_COND6_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_COND10_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT104]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND28_PREHEADER_PREHEADER:%.*]], label [[FOR_COND10_PREHEADER]]
-; CHECK:       for.cond28.preheader.preheader:
-; CHECK-NEXT:    br label [[FOR_COND28_PREHEADER:%.*]]
-; CHECK:       for.cond28.preheader:
-; CHECK-NEXT:    [[IV89:%.*]] = phi i64 [ [[IV_NEXT90:%.*]], [[FOR_COND28_PREHEADER]] ], [ 0, [[FOR_COND28_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV89]]
-; CHECK-NEXT:    [[ARRAYIDX37_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[IV87]], [[ARRAYIDX37_PROMOTED]]
-; CHECK-NEXT:    store i32 [[TMP4]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT:    [[IV_NEXT90]] = add nuw nsw i64 [[IV89]], 1
-; CHECK-NEXT:    [[EXITCOND93_NOT:%.*]] = icmp eq i64 [[IV_NEXT90]], [[WIDE_TRIP_COUNT104]]
-; CHECK-NEXT:    br i1 [[EXITCOND93_NOT]], label [[FOR_COND_CLEANUP25]], label [[FOR_COND28_PREHEADER]]
-; CHECK:       for.cond.cleanup25:
-; CHECK-NEXT:    [[IV_NEXT95]] = add nuw nsw i64 [[IV94]], 1
-; CHECK-NEXT:    [[EXITCOND98_NOT:%.*]] = icmp eq i64 [[IV_NEXT95]], [[WIDE_TRIP_COUNT104]]
-; CHECK-NEXT:    br i1 [[EXITCOND98_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_COND6_PREHEADER]]
 ;
-entry:
-  %cmp84 = icmp sgt i32 %n, 0
-  br i1 %cmp84, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:
-  %wide.trip.count104 = zext i32 %n to i64
-  br label %for.cond6.preheader.lr.ph
-
-for.cond.cleanup:
-  ret void
-
-for.cond6.preheader.lr.ph:
-  %iv99 = phi i64 [ 0, %for.body.lr.ph ], [ %iv.next100, %for.cond.cleanup4 ]
-  %iv87 = phi i32 [ %n, %for.body.lr.ph ], [ %iv.next88, %for.cond.cleanup4 ]
-  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %iv99
-  %0 = load ptr, ptr %arrayidx, align 8
-  store i32 0, ptr %0, align 4
-  %1 = trunc i64 %iv99 to i32
-  %2 = add i32 %1, 2
-  %3 = add i32 %2, %n
-  br label %for.cond6.preheader
-
-for.cond6.preheader:
-  %iv94 = phi i64 [ 0, %for.cond6.preheader.lr.ph ], [ %iv.next95, %for.cond.cleanup25 ]
-  %arrayidx16 = getelementptr inbounds ptr, ptr %a, i64 %iv94
-  %.pre = load ptr, ptr %arrayidx16, align 8
-  br label %for.cond10.preheader
-
-for.cond.cleanup4:
-  %iv.next100 = add nuw nsw i64 %iv99, 1
-  %iv.next88 = add i32 %iv87, %n
-  %exitcond105.not = icmp eq i64 %iv.next100, %wide.trip.count104
-  br i1 %exitcond105.not, label %for.cond.cleanup, label %for.cond6.preheader.lr.ph
-
-for.cond10.preheader:
-  %iv = phi i64 [ 0, %for.cond6.preheader ], [ %iv.next, %for.cond10.preheader ]
-  %arrayidx18 = getelementptr inbounds i32, ptr %.pre, i64 %iv
-  store i32 %3, ptr %arrayidx18, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count104
-  br i1 %exitcond.not, label %for.cond28.preheader, label %for.cond10.preheader
-
-for.cond28.preheader:
-  %iv89 = phi i64 [ %iv.next90, %for.cond28.preheader ], [ 0, %for.cond10.preheader ]
-  %arrayidx37 = getelementptr inbounds i32, ptr %.pre, i64 %iv89
-  %arrayidx37.promoted = load i32, ptr %arrayidx37, align 4
-  %4 = add i32 %iv87, %arrayidx37.promoted
-  store i32 %4, ptr %arrayidx37, align 4
-  %iv.next90 = add nuw nsw i64 %iv89, 1
-  %exitcond93.not = icmp eq i64 %iv.next90, %wide.trip.count104
-  br i1 %exitcond93.not, label %for.cond.cleanup25, label %for.cond28.preheader
-
-for.cond.cleanup25:
-  %iv.next95 = add nuw nsw i64 %iv94, 1
-  %exitcond98.not = icmp eq i64 %iv.next95, %wide.trip.count104
-  br i1 %exitcond98.not, label %for.cond.cleanup4, label %for.cond6.preheader
-}
-
-; void test2(int n, int **a)
-; {
-;   for (int k = 0; k < n; ++k) {
-;     a[k][0] = 0;
-;     #pragma clang loop vectorize_width(4)
-;     for (int i = 0; i < n; ++i) {
-;         for (int j = 0; j < n; ++j) {
-;             for (int x = 0; x < n; ++x) {
-;               a[i][j] = 2 + k+x;
-;             }
-;         }
+; Make sure the VPlan HCFG is constructed when we try to vectorize a non-outermost loop that has other loops at the same nesting level.
 ;
-;         for (int j = 0; j < n; ++j) {
-;             for (int x = 0; x < n; ++x) {
-;               a[i][j] += 2 + k+x;
-;             }
-;         }
-;     }
-;   }
-; }
-define void @test2(i32 %n, ptr %a) {
-; CHECK-LABEL: define void @test2(
-; CHECK-SAME: i32 [[N:%.*]], ptr [[A:%.*]]) {
+define void @non_outermost_loop_hcfg_construction_other_loops_at_same_level(i64 %n, ptr %a) {
+; CHECK-LABEL: define void @non_outermost_loop_hcfg_construction_other_loops_at_same_level(
+; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP84:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP84]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK:       for.body.lr.ph:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT104:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER_LR_PH:%.*]]
-; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
-; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    br label [[OUTERMOST_LOOP_K:%.*]]
+; CHECK:       return:
 ; CHECK-NEXT:    ret void
-; CHECK:       for.cond6.preheader.lr.ph:
-; CHECK-NEXT:    [[IV99:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT100:%.*]], [[FOR_COND_CLEANUP4:%.*]] ]
-; CHECK-NEXT:    [[IV87:%.*]] = phi i32 [ [[N]], [[FOR_BODY_LR_PH]] ], [ [[IV_NEXT88:%.*]], [[FOR_COND_CLEANUP4]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV99]]
+; CHECK:       outermost.loop.k:
+; CHECK-NEXT:    [[K:%.*]] = phi i64 [ [[K_NEXT:%.*]], [[OUTERMOST_LOOP_K_CLEANUP:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[K]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    store i32 0, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[IV99]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], [[N]]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT104]], 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[K]], 2
+; CHECK-NEXT:    br label [[MIDDLE_LOOP_I:%.*]]
+; CHECK:       middle.loop.i:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[OUTERMOST_LOOP_K]] ], [ [[I_NEXT:%.*]], [[MIDDLE_LOOP_I_CLEANUP:%.*]] ]
+; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr ptr, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT104]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT104]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[WIDE_TRIP_COUNT104]], i64 0
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[ADD]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[IV87]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND_CLEANUP2510:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND_CLEANUP2510]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[A]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
-; CHECK-NEXT:    br label [[FOR_COND10_PREHEADER1:%.*]]
-; CHECK:       for.cond10.preheader1:
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_COND10_PREHEADER1]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP6]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_COND28_PREHEADER_PREHEADER4:%.*]], label [[FOR_COND10_PREHEADER1]]
-; CHECK:       for.cond28.preheader.preheader4:
-; CHECK-NEXT:    br label [[FOR_COND28_PREHEADER5:%.*]]
-; CHECK:       for.cond28.preheader5:
-; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i64> [ zeroinitializer, [[FOR_COND28_PREHEADER_PREHEADER4]] ], [ [[TMP11:%.*]], [[FOR_COND28_PREHEADER5]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI6]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT9]], [[WIDE_MASKED_GATHER7]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP10]], <4 x ptr> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT:    [[TMP11]] = add nuw nsw <4 x i64> [[VEC_PHI6]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq <4 x i64> [[TMP11]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0
-; CHECK-NEXT:    br i1 [[TMP13]], label [[FOR_COND_CLEANUP2510]], label [[FOR_COND28_PREHEADER5]]
-; CHECK:       for.cond.cleanup2510:
-; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <4 x i64> [[TMP14]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[MIDDLE_LOOP_J0_CLEANUP4:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[MIDDLE_LOOP_J0_CLEANUP4]] ]
+; CHECK-NEXT:    br label [[INNERMOST_LOOP1:%.*]]
+; CHECK:       innermost.loop1:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[INNERMOST_LOOP1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr ptr, ptr [[INVARIANT_GEP]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP3]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_LOOP_J0_CLEANUP4]], label [[INNERMOST_LOOP1]]
+; CHECK:       middle.loop.j0.cleanup4:
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT3]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT104]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP4]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNERMOST_LOOP_J1_LR_PH:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND6_PREHEADER_LR_PH]] ]
-; CHECK-NEXT:    br label [[FOR_COND6_PREHEADER:%.*]]
-; CHECK:       for.cond6.preheader:
-; CHECK-NEXT:    [[IV94:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT95:%.*]], [[FOR_COND_CLEANUP25:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[IV94]]
-; CHECK-NEXT:    [[DOTPRE:%.*]] = load ptr, ptr [[ARRAYIDX16]], align 8
-; CHECK-NEXT:    br label [[FOR_COND10_PREHEADER:%.*]]
-; CHECK:       for.cond.cleanup4:
-; CHECK-NEXT:    [[IV_NEXT100]] = add nuw nsw i64 [[IV99]], 1
-; CHECK-NEXT:    [[IV_NEXT88]] = add i32 [[IV87]], [[N]]
-; CHECK-NEXT:    [[EXITCOND105_NOT:%.*]] = icmp eq i64 [[IV_NEXT100]], [[WIDE_TRIP_COUNT104]]
-; CHECK-NEXT:    br i1 [[EXITCOND105_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_COND6_PREHEADER_LR_PH]]
-; CHECK:       for.cond10.preheader:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[FOR_COND6_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_COND10_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[TMP3]], ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT104]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND28_PREHEADER_PREHEADER:%.*]], label [[FOR_COND10_PREHEADER]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       for.cond28.preheader.preheader:
-; CHECK-NEXT:    br label [[FOR_COND28_PREHEADER:%.*]]
-; CHECK:       for.cond28.preheader:
-; CHECK-NEXT:    [[IV89:%.*]] = phi i64 [ [[IV_NEXT90:%.*]], [[FOR_COND28_PREHEADER]] ], [ 0, [[FOR_COND28_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i32, ptr [[DOTPRE]], i64 [[IV89]]
-; CHECK-NEXT:    [[ARRAYIDX37_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[IV87]], [[ARRAYIDX37_PROMOTED]]
-; CHECK-NEXT:    store i32 [[TMP17]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT:    [[IV_NEXT90]] = add nuw nsw i64 [[IV89]], 1
-; CHECK-NEXT:    [[EXITCOND93_NOT:%.*]] = icmp eq i64 [[IV_NEXT90]], [[WIDE_TRIP_COUNT104]]
-; CHECK-NEXT:    br i1 [[EXITCOND93_NOT]], label [[FOR_COND_CLEANUP25]], label [[FOR_COND28_PREHEADER]]
-; CHECK:       for.cond.cleanup25:
-; CHECK-NEXT:    [[IV_NEXT95]] = add nuw nsw i64 [[IV94]], 1
-; CHECK-NEXT:    [[EXITCOND98_NOT:%.*]] = icmp eq i64 [[IV_NEXT95]], [[WIDE_TRIP_COUNT104]]
-; CHECK-NEXT:    br i1 [[EXITCOND98_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_COND6_PREHEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[MIDDLE_LOOP_I]] ]
+; CHECK-NEXT:    br label [[MIDDLE_LOOP_J0_PH:%.*]]
+; CHECK:       outermost.loop.k.cleanup:
+; CHECK-NEXT:    [[K_NEXT]] = add nuw nsw i64 [[K]], 1
+; CHECK-NEXT:    [[EXITCOND71_NOT:%.*]] = icmp eq i64 [[K_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND71_NOT]], label [[RETURN:%.*]], label [[OUTERMOST_LOOP_K]]
+; CHECK:       innermost.loop.j1.lr.ph:
+; CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[INVARIANT_GEP]], align 8
+; CHECK-NEXT:    br label [[INNERMOST_LOOP_J1:%.*]]
+; CHECK:       middle.loop.j0.ph:
+; CHECK-NEXT:    [[J0:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J0_NEXT:%.*]], [[MIDDLE_LOOP_J0_CLEANUP:%.*]] ]
+; CHECK-NEXT:    br label [[INNERMOST_LOOP:%.*]]
+; CHECK:       middle.loop.j0.cleanup:
+; CHECK-NEXT:    [[J0_NEXT]] = add nuw nsw i64 [[J0]], 1
+; CHECK-NEXT:    [[J0_EXIT_COND_NOT:%.*]] = icmp eq i64 [[J0_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[J0_EXIT_COND_NOT]], label [[INNERMOST_LOOP_J1_LR_PH]], label [[MIDDLE_LOOP_J0_PH]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       innermost.loop:
+; CHECK-NEXT:    [[X:%.*]] = phi i64 [ 0, [[MIDDLE_LOOP_J0_PH]] ], [ [[X_NEXT:%.*]], [[INNERMOST_LOOP]] ]
+; CHECK-NEXT:    [[ADD14:%.*]] = add nuw nsw i64 [[ADD]], [[X]]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[ADD14]] to i32
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr ptr, ptr [[INVARIANT_GEP]], i64 [[X]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[J0]]
+; CHECK-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX17]], align 4
+; CHECK-NEXT:    [[X_NEXT]] = add nuw nsw i64 [[X]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[X_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[MIDDLE_LOOP_J0_CLEANUP]], label [[INNERMOST_LOOP]]
+; CHECK:       middle.loop.i.cleanup:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND70_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND70_NOT]], label [[OUTERMOST_LOOP_K_CLEANUP]], label [[MIDDLE_LOOP_I]]
+; CHECK:       innermost.loop.j1:
+; CHECK-NEXT:    [[J21_064:%.*]] = phi i64 [ [[N]], [[INNERMOST_LOOP_J1_LR_PH]] ], [ [[DEC:%.*]], [[INNERMOST_LOOP_J1]] ]
+; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[J21_064]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = and i64 [[J21_064]], 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP14]], 0
+; CHECK-NEXT:    [[CONV30:%.*]] = select i1 [[DOTNOT]], i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    store i32 [[CONV30]], ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT:    [[DEC]] = add nsw i64 [[J21_064]], -1
+; CHECK-NEXT:    [[CMP23:%.*]] = icmp sgt i64 [[J21_064]], 1
+; CHECK-NEXT:    br i1 [[CMP23]], label [[INNERMOST_LOOP_J1]], label [[MIDDLE_LOOP_I_CLEANUP]]
 ;
 entry:
-  %cmp84 = icmp sgt i32 %n, 0
-  br i1 %cmp84, label %for.body.lr.ph, label %for.cond.cleanup
+  br label %outermost.loop.k
 
-for.body.lr.ph:
-  %wide.trip.count104 = zext i32 %n to i64
-  br label %for.cond6.preheader.lr.ph
-
-for.cond.cleanup:
+return:
   ret void
 
-for.cond6.preheader.lr.ph:
-  %iv99 = phi i64 [ 0, %for.body.lr.ph ], [ %iv.next100, %for.cond.cleanup4 ]
-  %iv87 = phi i32 [ %n, %for.body.lr.ph ], [ %iv.next88, %for.cond.cleanup4 ]
-  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %iv99
+outermost.loop.k:
+  %k = phi i64 [ %k.next, %outermost.loop.k.cleanup ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %k
   %0 = load ptr, ptr %arrayidx, align 8
   store i32 0, ptr %0, align 4
-  %1 = trunc i64 %iv99 to i32
-  %2 = add i32 %1, 2
-  %3 = add i32 %2, %n
-  br label %for.cond6.preheader
+  %add = add nuw nsw i64 %k, 2
+  br label %middle.loop.i
+
+middle.loop.i:
+  %i = phi i64 [ 0, %outermost.loop.k ], [ %i.next, %middle.loop.i.cleanup ]
+  %invariant.gep = getelementptr ptr, ptr %a, i64 %i
+  br label %middle.loop.j0.ph
+
+outermost.loop.k.cleanup:
+  %k.next = add nuw nsw i64 %k, 1
+  %exitcond71.not = icmp eq i64 %k.next, %n
+  br i1 %exitcond71.not, label %return, label %outermost.loop.k
+
+innermost.loop.j1.lr.ph:                                 ; preds = %middle.loop.j0.cleanup
+  %1 = load ptr, ptr %invariant.gep, align 8
+  br label %innermost.loop.j1
 
-for.cond6.preheader:
-  %iv94 = phi i64 [ 0, %for.cond6.preheader.lr.ph ], [ %iv.next95, %for.cond.cleanup25 ]
-  %arrayidx16 = getelementptr inbounds ptr, ptr %a, i64 %iv94
-  %.pre = load ptr, ptr %arrayidx16, align 8
-  br label %for.cond10.preheader
+middle.loop.j0.ph:
+  %j0 = phi i64 [ 0, %middle.loop.i ], [ %j0.next, %middle.loop.j0.cleanup ]
+  br label %innermost.loop
 
-for.cond.cleanup4:
-  %iv.next100 = add nuw nsw i64 %iv99, 1
-  %iv.next88 = add i32 %iv87, %n
-  %exitcond105.not = icmp eq i64 %iv.next100, %wide.trip.count104
-  br i1 %exitcond105.not, label %for.cond.cleanup, label %for.cond6.preheader.lr.ph
+middle.loop.j0.cleanup:
+  %j0.next = add nuw nsw i64 %j0, 1
+  %j0.exit.cond.not = icmp eq i64 %j0.next, %n
+  br i1 %j0.exit.cond.not, label %innermost.loop.j1.lr.ph, label %middle.loop.j0.ph, !llvm.loop !3
 
-for.cond10.preheader:
-  %iv = phi i64 [ 0, %for.cond6.preheader ], [ %iv.next, %for.cond10.preheader ]
-  %arrayidx18 = getelementptr inbounds i32, ptr %.pre, i64 %iv
-  store i32 %3, ptr %arrayidx18, align 4
-  %iv.next = add nuw nsw i64 %iv, 1
-  %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count104
-  br i1 %exitcond.not, label %for.cond28.preheader, label %for.cond10.preheader, !llvm.loop !3
+innermost.loop:
+  %x = phi i64 [ 0, %middle.loop.j0.ph ], [ %x.next, %innermost.loop ]
+  %add14 = add nuw nsw i64 %add, %x
+  %conv = trunc i64 %add14 to i32
+  %gep = getelementptr ptr, ptr %invariant.gep, i64 %x
+  %2 = load ptr, ptr %gep, align 8
+  %arrayidx17 = getelementptr inbounds i32, ptr %2, i64 %j0
+  store i32 %conv, ptr %arrayidx17, align 4
+  %x.next = add nuw nsw i64 %x, 1
+  %exitcond.not = icmp eq i64 %x.next, %n
+  br i1 %exitcond.not, label %middle.loop.j0.cleanup, label %innermost.loop
 
-for.cond28.preheader:
-  %iv89 = phi i64 [ %iv.next90, %for.cond28.preheader ], [ 0, %for.cond10.preheader ]
-  %arrayidx37 = getelementptr inbounds i32, ptr %.pre, i64 %iv89
-  %arrayidx37.promoted = load i32, ptr %arrayidx37, align 4
-  %4 = add i32 %iv87, %arrayidx37.promoted
-  store i32 %4, ptr %arrayidx37, align 4
-  %iv.next90 = add nuw nsw i64 %iv89, 1
-  %exitcond93.not = icmp eq i64 %iv.next90, %wide.trip.count104
-  br i1 %exitcond93.not, label %for.cond.cleanup25, label %for.cond28.preheader
+middle.loop.i.cleanup:
+  %i.next = add nuw nsw i64 %i, 1
+  %exitcond70.not = icmp eq i64 %i.next, %n
+  br i1 %exitcond70.not, label %outermost.loop.k.cleanup, label %middle.loop.i
 
-for.cond.cleanup25:
-  %iv.next95 = add nuw nsw i64 %iv94, 1
-  %exitcond98.not = icmp eq i64 %iv.next95, %wide.trip.count104
-  br i1 %exitcond98.not, label %for.cond.cleanup4, label %for.cond6.preheader, !llvm.loop !3
+innermost.loop.j1:
+  %j21.064 = phi i64 [ %n, %innermost.loop.j1.lr.ph ], [ %dec, %innermost.loop.j1 ]
+  %arrayidx28 = getelementptr inbounds i32, ptr %1, i64 %j21.064
+  %3 = load i32, ptr %arrayidx28, align 4
+  %4 = and i64 %j21.064, 1
+  %.not = icmp eq i64 %4, 0
+  %conv30 = select i1 %.not, i32 0, i32 %3
+  store i32 %conv30, ptr %arrayidx28, align 4
+  %dec = add nsw i64 %j21.064, -1
+  %cmp23 = icmp sgt i64 %j21.064, 1
+  br i1 %cmp23, label %innermost.loop.j1, label %middle.loop.i.cleanup
 }
 
 !3 = distinct !{!3, !4, !5, !6}

>From 3eb8bf8cd142147d2671627dfc54fd2daadf6a40 Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Fri, 17 Nov 2023 14:03:49 -0800
Subject: [PATCH 3/3] Addressed comments before merge

---
 llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 6114287b93e67d3..f950d4740e413c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -156,8 +156,8 @@ static bool isHeaderVPBB(VPBasicBlock *VPBB) {
   return VPBB->getParent() && VPBB->getParent()->getEntry() == VPBB;
 }
 
-/// Return true of \p L loop is contained within \p OuterLoop
-static bool isNestedLoop(const Loop *L, const Loop *OuterLoop) {
+/// Return true if loop \p L is contained within \p OuterLoop.
+static bool doesContainLoop(const Loop *L, const Loop *OuterLoop) {
   if (L->getLoopDepth() < OuterLoop->getLoopDepth())
     return false;
   const Loop *P = L;
@@ -187,7 +187,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
 
   // Get or create a region for the loop containing BB.
   Loop *LoopOfBB = LI->getLoopFor(BB);
-  if (!LoopOfBB || !isNestedLoop(LoopOfBB, TheLoop))
+  if (!LoopOfBB || !doesContainLoop(LoopOfBB, TheLoop))
     return VPBB;
 
   auto *RegionOfVPBB = Loop2Region.lookup(LoopOfBB);



More information about the llvm-commits mailing list