[llvm] 7d6ec3b - [LV] Add more tests for vector loop removal.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 31 12:10:52 PST 2024
Author: Florian Hahn
Date: 2024-12-31T20:08:54Z
New Revision: 7d6ec3b9680a53e58235743080bf223067050fbc
URL: https://github.com/llvm/llvm-project/commit/7d6ec3b9680a53e58235743080bf223067050fbc
DIFF: https://github.com/llvm/llvm-project/commit/7d6ec3b9680a53e58235743080bf223067050fbc.diff
LOG: [LV] Add more tests for vector loop removal.
Add missing test coverage for loops whose vector loop region can be
removed, including loops that contain replicate recipes as well as nested loops.
Extra test coverage for https://github.com/llvm/llvm-project/pull/108378.
Added:
Modified:
llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
Removed:
################################################################################
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
index fd75177c0d1062..8bcba56e0d43a0 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll
@@ -1,19 +1,141 @@
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF8UF1 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=CHECK,VF8UF2 %s
-; RUN: opt -passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF16UF1 %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF8UF1 %s
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=CHECK,VF8UF2 %s
+; RUN: opt -passes='loop-vectorize,verify<loops>' -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=CHECK,VF16UF1 %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
; Check if the vector loop condition can be simplified to true for a given
; VF/IC combination.
define void @test_tc_less_than_16(ptr %A, i64 %N) {
-; CHECK-LABEL: define void @test_tc_less_than_16(
-; VF8UF1: [[CMP:%.+]] = icmp eq i64 %index.next, %n.vec
-; VF8UF1-NEXT: br i1 [[CMP]], label %middle.block, label %vector.body
+; VF8UF1-LABEL: define void @test_tc_less_than_16(
+; VF8UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF1-NEXT: [[ENTRY:.*]]:
+; VF8UF1-NEXT: [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[AND]], 8
+; VF8UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1: [[VECTOR_PH]]:
+; VF8UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[AND]], 8
+; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF1-NEXT: [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF1: [[VECTOR_BODY]]:
+; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; VF8UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF1-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP3]], align 1
+; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF1: [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1: [[SCALAR_PH]]:
+; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT: br label %[[LOOP:.*]]
+; VF8UF1: [[LOOP]]:
+; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF1: [[EXIT]]:
+; VF8UF1-NEXT: ret void
;
-; VF8UF2: br i1 true, label %middle.block, label %vector.body
+; VF8UF2-LABEL: define void @test_tc_less_than_16(
+; VF8UF2-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF8UF2-NEXT: [[ENTRY:.*]]:
+; VF8UF2-NEXT: [[AND:%.*]] = and i64 [[N]], 15
+; VF8UF2-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2: [[VECTOR_PH]]:
+; VF8UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF8UF2-NEXT: [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF2: [[VECTOR_BODY]]:
+; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]]
+; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
+; VF8UF2-NEXT: [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT: [[TMP5:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT: store <8 x i8> [[TMP4]], ptr [[TMP2]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP3]], align 1
+; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2: [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2: [[SCALAR_PH]]:
+; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF8UF2-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT: br label %[[LOOP:.*]]
+; VF8UF2: [[LOOP]]:
+; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2: [[EXIT]]:
+; VF8UF2-NEXT: ret void
;
-; VF16UF1: br i1 true, label %middle.block, label %vector.body
+; VF16UF1-LABEL: define void @test_tc_less_than_16(
+; VF16UF1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) {
+; VF16UF1-NEXT: [[ENTRY:.*]]:
+; VF16UF1-NEXT: [[AND:%.*]] = and i64 [[N]], 15
+; VF16UF1-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1: [[VECTOR_PH]]:
+; VF16UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[AND]], 16
+; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[AND]], [[N_MOD_VF]]
+; VF16UF1-NEXT: [[TMP0:%.*]] = sub i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF16UF1: [[VECTOR_BODY]]:
+; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF16UF1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; VF16UF1-NEXT: [[TMP3:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP2]], align 1
+; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF16UF1: [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[AND]], [[N_VEC]]
+; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1: [[SCALAR_PH]]:
+; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[AND]], %[[ENTRY]] ]
+; VF16UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT: br label %[[LOOP:.*]]
+; VF16UF1: [[LOOP]]:
+; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1: [[EXIT]]:
+; VF16UF1-NEXT: ret void
;
entry:
%and = and i64 %N, 15
@@ -33,3 +155,653 @@ loop:
exit:
ret void
}
+
+define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, 10) %N) {
+; VF8UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF1-NEXT: [[ENTRY:.*]]:
+; VF8UF1-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1: [[VECTOR_PH]]:
+; VF8UF1-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 7
+; VF8UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8
+; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF1: [[VECTOR_BODY]]:
+; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE16:.*]] ]
+; VF8UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
+; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF1-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; VF8UF1-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF8UF1-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF1: [[PRED_STORE_IF]]:
+; VF8UF1-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP20]]
+; VF8UF1-NEXT: store i16 0, ptr [[TMP4]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF8UF1: [[PRED_STORE_CONTINUE]]:
+; VF8UF1-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF8UF1: [[PRED_STORE_IF3]]:
+; VF8UF1-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 1
+; VF8UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]]
+; VF8UF1-NEXT: store i16 0, ptr [[TMP6]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; VF8UF1: [[PRED_STORE_CONTINUE4]]:
+; VF8UF1-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF8UF1: [[PRED_STORE_IF5]]:
+; VF8UF1-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 2
+; VF8UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP23]]
+; VF8UF1-NEXT: store i16 0, ptr [[TMP8]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; VF8UF1: [[PRED_STORE_CONTINUE6]]:
+; VF8UF1-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF8UF1: [[PRED_STORE_IF7]]:
+; VF8UF1-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 3
+; VF8UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP24]]
+; VF8UF1-NEXT: store i16 0, ptr [[TMP10]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; VF8UF1: [[PRED_STORE_CONTINUE8]]:
+; VF8UF1-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF8UF1: [[PRED_STORE_IF9]]:
+; VF8UF1-NEXT: [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 4
+; VF8UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP26]]
+; VF8UF1-NEXT: store i16 0, ptr [[TMP12]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; VF8UF1: [[PRED_STORE_CONTINUE10]]:
+; VF8UF1-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF8UF1: [[PRED_STORE_IF11]]:
+; VF8UF1-NEXT: [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 5
+; VF8UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP19]]
+; VF8UF1-NEXT: store i16 0, ptr [[TMP14]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; VF8UF1: [[PRED_STORE_CONTINUE12]]:
+; VF8UF1-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF8UF1: [[PRED_STORE_IF13]]:
+; VF8UF1-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 6
+; VF8UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP22]]
+; VF8UF1-NEXT: store i16 0, ptr [[TMP16]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; VF8UF1: [[PRED_STORE_CONTINUE14]]:
+; VF8UF1-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16]]
+; VF8UF1: [[PRED_STORE_IF15]]:
+; VF8UF1-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 7
+; VF8UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP25]]
+; VF8UF1-NEXT: store i16 0, ptr [[TMP18]], align 2
+; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE16]]
+; VF8UF1: [[PRED_STORE_CONTINUE16]]:
+; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF1: [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF1: [[SCALAR_PH]]:
+; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF1-NEXT: br label %[[LOOP:.*]]
+; VF8UF1: [[LOOP]]:
+; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF1: [[EXIT]]:
+; VF8UF1-NEXT: ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF8UF2-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF8UF2-NEXT: [[ENTRY:.*]]:
+; VF8UF2-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF8UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2: [[VECTOR_PH]]:
+; VF8UF2-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF8UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF8UF2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF8UF2-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF8UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF2: [[VECTOR_BODY]]:
+; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
+; VF8UF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
+; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
+; VF8UF2-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; VF8UF2-NEXT: [[VEC_IV3:%.*]] = add <8 x i64> [[BROADCAST_SPLAT1]], <i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; VF8UF2-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT: [[TMP3:%.*]] = icmp ule <8 x i64> [[VEC_IV3]], [[BROADCAST_SPLAT]]
+; VF8UF2-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP2]], i32 0
+; VF8UF2-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF8UF2: [[PRED_STORE_IF]]:
+; VF8UF2-NEXT: [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP5]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF8UF2: [[PRED_STORE_CONTINUE]]:
+; VF8UF2-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP2]], i32 1
+; VF8UF2-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; VF8UF2: [[PRED_STORE_IF6]]:
+; VF8UF2-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 1
+; VF8UF2-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP7]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE7]]
+; VF8UF2: [[PRED_STORE_CONTINUE7]]:
+; VF8UF2-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP2]], i32 2
+; VF8UF2-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; VF8UF2: [[PRED_STORE_IF8]]:
+; VF8UF2-NEXT: [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 2
+; VF8UF2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP9]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE9]]
+; VF8UF2: [[PRED_STORE_CONTINUE9]]:
+; VF8UF2-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP2]], i32 3
+; VF8UF2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
+; VF8UF2: [[PRED_STORE_IF10]]:
+; VF8UF2-NEXT: [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 3
+; VF8UF2-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP11]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE11]]
+; VF8UF2: [[PRED_STORE_CONTINUE11]]:
+; VF8UF2-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP2]], i32 4
+; VF8UF2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
+; VF8UF2: [[PRED_STORE_IF12]]:
+; VF8UF2-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 4
+; VF8UF2-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP13]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE13]]
+; VF8UF2: [[PRED_STORE_CONTINUE13]]:
+; VF8UF2-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP2]], i32 5
+; VF8UF2-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
+; VF8UF2: [[PRED_STORE_IF14]]:
+; VF8UF2-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 5
+; VF8UF2-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP15]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE15]]
+; VF8UF2: [[PRED_STORE_CONTINUE15]]:
+; VF8UF2-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP2]], i32 6
+; VF8UF2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
+; VF8UF2: [[PRED_STORE_IF16]]:
+; VF8UF2-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 6
+; VF8UF2-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP17]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE17]]
+; VF8UF2: [[PRED_STORE_CONTINUE17]]:
+; VF8UF2-NEXT: [[TMP18:%.*]] = extractelement <8 x i1> [[TMP2]], i32 7
+; VF8UF2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
+; VF8UF2: [[PRED_STORE_IF18]]:
+; VF8UF2-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 7
+; VF8UF2-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP19]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE19]]
+; VF8UF2: [[PRED_STORE_CONTINUE19]]:
+; VF8UF2-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; VF8UF2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
+; VF8UF2: [[PRED_STORE_IF20]]:
+; VF8UF2-NEXT: [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 8
+; VF8UF2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP21]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE21]]
+; VF8UF2: [[PRED_STORE_CONTINUE21]]:
+; VF8UF2-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1
+; VF8UF2-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
+; VF8UF2: [[PRED_STORE_IF22]]:
+; VF8UF2-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 9
+; VF8UF2-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP23]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE23]]
+; VF8UF2: [[PRED_STORE_CONTINUE23]]:
+; VF8UF2-NEXT: [[TMP24:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2
+; VF8UF2-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
+; VF8UF2: [[PRED_STORE_IF24]]:
+; VF8UF2-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 10
+; VF8UF2-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP51]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP25]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE25]]
+; VF8UF2: [[PRED_STORE_CONTINUE25]]:
+; VF8UF2-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3
+; VF8UF2-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]]
+; VF8UF2: [[PRED_STORE_IF26]]:
+; VF8UF2-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 11
+; VF8UF2-NEXT: [[TMP27:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP27]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE27]]
+; VF8UF2: [[PRED_STORE_CONTINUE27]]:
+; VF8UF2-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4
+; VF8UF2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
+; VF8UF2: [[PRED_STORE_IF28]]:
+; VF8UF2-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 12
+; VF8UF2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP29]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE29]]
+; VF8UF2: [[PRED_STORE_CONTINUE29]]:
+; VF8UF2-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5
+; VF8UF2-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
+; VF8UF2: [[PRED_STORE_IF30]]:
+; VF8UF2-NEXT: [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 13
+; VF8UF2-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP31]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE31]]
+; VF8UF2: [[PRED_STORE_CONTINUE31]]:
+; VF8UF2-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6
+; VF8UF2-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
+; VF8UF2: [[PRED_STORE_IF32]]:
+; VF8UF2-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 14
+; VF8UF2-NEXT: [[TMP33:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP33]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE33]]
+; VF8UF2: [[PRED_STORE_CONTINUE33]]:
+; VF8UF2-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7
+; VF8UF2-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]]
+; VF8UF2: [[PRED_STORE_IF34]]:
+; VF8UF2-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 15
+; VF8UF2-NEXT: [[TMP35:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]]
+; VF8UF2-NEXT: store i16 0, ptr [[TMP35]], align 2
+; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE35]]
+; VF8UF2: [[PRED_STORE_CONTINUE35]]:
+; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2: [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2: [[SCALAR_PH]]:
+; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF8UF2-NEXT: br label %[[LOOP:.*]]
+; VF8UF2: [[LOOP]]:
+; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT: [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF8UF2-NEXT: store i16 0, ptr [[GEP_DST]], align 2
+; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; VF8UF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2: [[EXIT]]:
+; VF8UF2-NEXT: ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_with_replicate_recipe(
+; VF16UF1-SAME: ptr [[DST:%.*]], i64 range(i64 5, 10) [[N:%.*]]) {
+; VF16UF1-NEXT: [[ENTRY:.*]]:
+; VF16UF1-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -2
+; VF16UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1: [[VECTOR_PH]]:
+; VF16UF1-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 15
+; VF16UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; VF16UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; VF16UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]]
+; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; VF16UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF16UF1: [[VECTOR_BODY]]:
+; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE32:.*]] ]
+; VF16UF1-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; VF16UF1-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT1]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; VF16UF1-NEXT: [[TMP2:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; VF16UF1-NEXT: [[TMP3:%.*]] = extractelement <16 x i1> [[TMP2]], i32 0
+; VF16UF1-NEXT: br i1 [[TMP3]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VF16UF1: [[PRED_STORE_IF]]:
+; VF16UF1-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP35]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP4]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VF16UF1: [[PRED_STORE_CONTINUE]]:
+; VF16UF1-NEXT: [[TMP5:%.*]] = extractelement <16 x i1> [[TMP2]], i32 1
+; VF16UF1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; VF16UF1: [[PRED_STORE_IF3]]:
+; VF16UF1-NEXT: [[TMP36:%.*]] = add i64 [[OFFSET_IDX]], 1
+; VF16UF1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP36]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP6]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; VF16UF1: [[PRED_STORE_CONTINUE4]]:
+; VF16UF1-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP2]], i32 2
+; VF16UF1-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; VF16UF1: [[PRED_STORE_IF5]]:
+; VF16UF1-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX]], 2
+; VF16UF1-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP38]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP8]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; VF16UF1: [[PRED_STORE_CONTINUE6]]:
+; VF16UF1-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP2]], i32 3
+; VF16UF1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; VF16UF1: [[PRED_STORE_IF7]]:
+; VF16UF1-NEXT: [[TMP39:%.*]] = add i64 [[OFFSET_IDX]], 3
+; VF16UF1-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP39]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP10]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; VF16UF1: [[PRED_STORE_CONTINUE8]]:
+; VF16UF1-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP2]], i32 4
+; VF16UF1-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; VF16UF1: [[PRED_STORE_IF9]]:
+; VF16UF1-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 4
+; VF16UF1-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP41]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP12]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; VF16UF1: [[PRED_STORE_CONTINUE10]]:
+; VF16UF1-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP2]], i32 5
+; VF16UF1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; VF16UF1: [[PRED_STORE_IF11]]:
+; VF16UF1-NEXT: [[TMP42:%.*]] = add i64 [[OFFSET_IDX]], 5
+; VF16UF1-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP42]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP14]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; VF16UF1: [[PRED_STORE_CONTINUE12]]:
+; VF16UF1-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP2]], i32 6
+; VF16UF1-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
+; VF16UF1: [[PRED_STORE_IF13]]:
+; VF16UF1-NEXT: [[TMP44:%.*]] = add i64 [[OFFSET_IDX]], 6
+; VF16UF1-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP44]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP16]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; VF16UF1: [[PRED_STORE_CONTINUE14]]:
+; VF16UF1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP2]], i32 7
+; VF16UF1-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
+; VF16UF1: [[PRED_STORE_IF15]]:
+; VF16UF1-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 7
+; VF16UF1-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP45]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP18]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE16]]
+; VF16UF1: [[PRED_STORE_CONTINUE16]]:
+; VF16UF1-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP2]], i32 8
+; VF16UF1-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
+; VF16UF1: [[PRED_STORE_IF17]]:
+; VF16UF1-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 8
+; VF16UF1-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP47]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP20]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE18]]
+; VF16UF1: [[PRED_STORE_CONTINUE18]]:
+; VF16UF1-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP2]], i32 9
+; VF16UF1-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
+; VF16UF1: [[PRED_STORE_IF19]]:
+; VF16UF1-NEXT: [[TMP48:%.*]] = add i64 [[OFFSET_IDX]], 9
+; VF16UF1-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP48]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP22]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE20]]
+; VF16UF1: [[PRED_STORE_CONTINUE20]]:
+; VF16UF1-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP2]], i32 10
+; VF16UF1-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
+; VF16UF1: [[PRED_STORE_IF21]]:
+; VF16UF1-NEXT: [[TMP50:%.*]] = add i64 [[OFFSET_IDX]], 10
+; VF16UF1-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP50]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP24]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE22]]
+; VF16UF1: [[PRED_STORE_CONTINUE22]]:
+; VF16UF1-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP2]], i32 11
+; VF16UF1-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
+; VF16UF1: [[PRED_STORE_IF23]]:
+; VF16UF1-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 11
+; VF16UF1-NEXT: [[TMP26:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP37]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP26]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE24]]
+; VF16UF1: [[PRED_STORE_CONTINUE24]]:
+; VF16UF1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP2]], i32 12
+; VF16UF1-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
+; VF16UF1: [[PRED_STORE_IF25]]:
+; VF16UF1-NEXT: [[TMP40:%.*]] = add i64 [[OFFSET_IDX]], 12
+; VF16UF1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP40]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP28]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE26]]
+; VF16UF1: [[PRED_STORE_CONTINUE26]]:
+; VF16UF1-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP2]], i32 13
+; VF16UF1-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
+; VF16UF1: [[PRED_STORE_IF27]]:
+; VF16UF1-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 13
+; VF16UF1-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP43]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP30]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE28]]
+; VF16UF1: [[PRED_STORE_CONTINUE28]]:
+; VF16UF1-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP2]], i32 14
+; VF16UF1-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]]
+; VF16UF1: [[PRED_STORE_IF29]]:
+; VF16UF1-NEXT: [[TMP46:%.*]] = add i64 [[OFFSET_IDX]], 14
+; VF16UF1-NEXT: [[TMP32:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP46]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP32]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE30]]
+; VF16UF1: [[PRED_STORE_CONTINUE30]]:
+; VF16UF1-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP2]], i32 15
+; VF16UF1-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32]]
+; VF16UF1: [[PRED_STORE_IF31]]:
+; VF16UF1-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 15
+; VF16UF1-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP49]]
+; VF16UF1-NEXT: store i16 0, ptr [[TMP34]], align 2
+; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE32]]
+; VF16UF1: [[PRED_STORE_CONTINUE32]]:
+; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1: [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1: [[SCALAR_PH]]:
+; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ]
+; VF16UF1-NEXT: br label %[[LOOP:.*]]
+; VF16UF1: [[LOOP]]:
+; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; VF16UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2
+; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; VF16UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1: [[EXIT]]:
+; VF16UF1-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ]
+ %gep.dst = getelementptr i16, ptr %dst, i64 %iv
+ store i16 0, ptr %gep.dst, align 2
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %N
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+declare i1 @cond()
+
+; Nested-loop test: only the inner loop is vectorized. %N has range [8, 17),
+; so with VF=16 (VF16UF1) or VF=8 x UF=2 (VF8UF2) the vector loop body runs at
+; most once and the backedge condition folds to `br i1 true`; with VF8UF1 the
+; trip count is not provably 1 and a real `icmp eq` backedge test remains.
+; NOTE(review): CHECK lines are autogenerated by update_test_checks.py --
+; regenerate with the script rather than editing them by hand.
+define void @remove_loop_region_outer_loop(i64 range(i64 8, 17) %N, ptr noalias %src, ptr %dst) {
+; VF8UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF1-NEXT: [[ENTRY:.*]]:
+; VF8UF1-NEXT: br label %[[OUTER_HEADER:.*]]
+; VF8UF1: [[OUTER_HEADER]]:
+; VF8UF1-NEXT: [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF1: [[VECTOR_PH]]:
+; VF8UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF1: [[VECTOR_BODY]]:
+; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; VF8UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP0]]
+; VF8UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; VF8UF1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF1-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF1: [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF1-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF1: [[SCALAR_PH]]:
+; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF1-NEXT: br label %[[INNER:.*]]
+; VF8UF1: [[INNER]]:
+; VF8UF1-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF1-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1: [[OUTER_LATCH]]:
+; VF8UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF1-NEXT: [[C_2:%.*]] = call i1 @cond()
+; VF8UF1-NEXT: br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF1: [[EXIT]]:
+; VF8UF1-NEXT: ret void
+;
+; VF8UF2-LABEL: define void @remove_loop_region_outer_loop(
+; VF8UF2-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF8UF2-NEXT: [[ENTRY:.*]]:
+; VF8UF2-NEXT: br label %[[OUTER_HEADER:.*]]
+; VF8UF2: [[OUTER_HEADER]]:
+; VF8UF2-NEXT: [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF8UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF8UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8UF2: [[VECTOR_PH]]:
+; VF8UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF8UF2: [[VECTOR_BODY]]:
+; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP6]]
+; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; VF8UF2-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0
+; VF8UF2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 8
+; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD]], ptr [[TMP4]], align 1
+; VF8UF2-NEXT: store <8 x i8> [[WIDE_LOAD1]], ptr [[TMP5]], align 1
+; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF8UF2: [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF2-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF8UF2: [[SCALAR_PH]]:
+; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF8UF2-NEXT: br label %[[INNER:.*]]
+; VF8UF2: [[INNER]]:
+; VF8UF2-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF8UF2-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF8UF2-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF8UF2-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF8UF2-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF2: [[OUTER_LATCH]]:
+; VF8UF2-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF8UF2-NEXT: [[C_2:%.*]] = call i1 @cond()
+; VF8UF2-NEXT: br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF8UF2: [[EXIT]]:
+; VF8UF2-NEXT: ret void
+;
+; VF16UF1-LABEL: define void @remove_loop_region_outer_loop(
+; VF16UF1-SAME: i64 range(i64 8, 17) [[N:%.*]], ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) {
+; VF16UF1-NEXT: [[ENTRY:.*]]:
+; VF16UF1-NEXT: br label %[[OUTER_HEADER:.*]]
+; VF16UF1: [[OUTER_HEADER]]:
+; VF16UF1-NEXT: [[OUTER_IV:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ]
+; VF16UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF16UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF16UF1: [[VECTOR_PH]]:
+; VF16UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; VF16UF1: [[VECTOR_BODY]]:
+; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF16UF1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[TMP4]]
+; VF16UF1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
+; VF16UF1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
+; VF16UF1-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF16UF1: [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF16UF1-NEXT: br i1 [[CMP_N]], label %[[OUTER_LATCH]], label %[[SCALAR_PH]]
+; VF16UF1: [[SCALAR_PH]]:
+; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[OUTER_HEADER]] ]
+; VF16UF1-NEXT: br label %[[INNER:.*]]
+; VF16UF1: [[INNER]]:
+; VF16UF1-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
+; VF16UF1-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[OUTER_IV]], i64 [[INNER_IV]]
+; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; VF16UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INNER_IV]]
+; VF16UF1-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1
+; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[INNER_IV]], 1
+; VF16UF1-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT: br i1 [[C_1]], label %[[OUTER_LATCH]], label %[[INNER]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF16UF1: [[OUTER_LATCH]]:
+; VF16UF1-NEXT: [[OUTER_IV_NEXT]] = getelementptr i8, ptr [[OUTER_IV]], i64 1
+; VF16UF1-NEXT: [[C_2:%.*]] = call i1 @cond()
+; VF16UF1-NEXT: br i1 [[C_2]], label %[[OUTER_HEADER]], label %[[EXIT:.*]]
+; VF16UF1: [[EXIT]]:
+; VF16UF1-NEXT: ret void
+;
+; Input IR: the outer loop advances a pointer IV (%outer.iv) by one byte per
+; iteration via @cond(); the inner loop copies %N i8 elements from the
+; %outer.iv-based source into %dst.
+entry:
+ br label %outer.header
+
+outer.header:
+ %outer.iv = phi ptr [ %src, %entry ], [ %outer.iv.next, %outer.latch ]
+ br label %inner
+
+inner:
+ %inner.iv = phi i64 [ 0, %outer.header ], [ %iv.next, %inner ]
+ %gep.src = getelementptr i8, ptr %outer.iv, i64 %inner.iv
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr i8, ptr %dst, i64 %inner.iv
+ store i8 %l, ptr %gep.dst, align 1
+ %iv.next = add i64 %inner.iv, 1
+ %c.1 = icmp eq i64 %iv.next, %N
+ br i1 %c.1, label %outer.latch, label %inner
+
+outer.latch:
+ %outer.iv.next = getelementptr i8, ptr %outer.iv, i64 1
+ %c.2 = call i1 @cond()
+ br i1 %c.2, label %outer.header, label %exit
+
+exit:
+ ret void
+}
+;.
+; VF8UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF8UF2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF8UF2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF8UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF8UF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; VF16UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; VF16UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; VF16UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; VF16UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
More information about the llvm-commits
mailing list