[llvm] [LV] Optimize VPWidenIntOrFpInductionRecipe for known TC (PR #118828)
Hari Limaye via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 06:13:24 PST 2024
https://github.com/hazzlim updated https://github.com/llvm/llvm-project/pull/118828
>From 95613947be64f8f83ddd71f0d419bb8f722af065 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 5 Dec 2024 16:14:46 +0000
Subject: [PATCH 1/4] [LV] Pre-commit tests for optimizing induction variable
width
---
...folding-optimize-vector-induction-width.ll | 898 ++++++++++++++++++
1 file changed, 898 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
new file mode 100644
index 00000000000000..252d6b063eee9b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
@@ -0,0 +1,898 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p) {
+; CHECK-LABEL: define dso_local void @canonical_small_tc_i8(
+; CHECK-SAME: ptr nocapture noundef writeonly [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 14)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: store i16 1, ptr [[TMP3]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: store i16 1, ptr [[TMP6]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT: store i16 1, ptr [[TMP9]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT: store i16 1, ptr [[TMP12]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP14]]
+; CHECK-NEXT: store i16 1, ptr [[TMP15]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP17]]
+; CHECK-NEXT: store i16 1, ptr [[TMP18]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT: store i16 1, ptr [[TMP21]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_IF13]]:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT: store i16 1, ptr [[TMP24]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_CONTINUE14]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 15
+; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.iv = getelementptr inbounds i16, ptr %p, i64 %iv
+ store i16 1, ptr %p.iv, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp eq i64 %iv.next, 15
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
+define dso_local void @canonical_upper_limit_i8(ptr nocapture noundef writeonly %p) {
+; CHECK-LABEL: define dso_local void @canonical_upper_limit_i8(
+; CHECK-SAME: ptr nocapture noundef writeonly [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 254)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: store i16 1, ptr [[TMP3]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: store i16 1, ptr [[TMP6]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT: store i16 1, ptr [[TMP9]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT: store i16 1, ptr [[TMP12]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP14]]
+; CHECK-NEXT: store i16 1, ptr [[TMP15]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP17]]
+; CHECK-NEXT: store i16 1, ptr [[TMP18]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT: store i16 1, ptr [[TMP21]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_IF13]]:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT: store i16 1, ptr [[TMP24]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_CONTINUE14]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 255
+; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.iv = getelementptr inbounds i16, ptr %p, i64 %iv
+ store i16 1, ptr %p.iv, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp eq i64 %iv.next, 255
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
+define dso_local void @canonical_lower_limit_i16(ptr nocapture noundef writeonly %p) {
+; CHECK-LABEL: define dso_local void @canonical_lower_limit_i16(
+; CHECK-SAME: ptr nocapture noundef writeonly [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 256)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: store i16 1, ptr [[TMP3]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: store i16 1, ptr [[TMP6]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT: store i16 1, ptr [[TMP9]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT: store i16 1, ptr [[TMP12]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP14]]
+; CHECK-NEXT: store i16 1, ptr [[TMP15]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP17]]
+; CHECK-NEXT: store i16 1, ptr [[TMP18]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT: store i16 1, ptr [[TMP21]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_IF13]]:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT: store i16 1, ptr [[TMP24]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_CONTINUE14]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 264
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 264, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 257
+; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.iv = getelementptr inbounds i16, ptr %p, i64 %iv
+ store i16 1, ptr %p.iv, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp eq i64 %iv.next, 257
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
+define dso_local void @canonical_upper_limit_i16(ptr nocapture noundef writeonly %p) {
+; CHECK-LABEL: define dso_local void @canonical_upper_limit_i16(
+; CHECK-SAME: ptr nocapture noundef writeonly [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 65534)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: store i16 1, ptr [[TMP3]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: store i16 1, ptr [[TMP6]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT: store i16 1, ptr [[TMP9]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT: store i16 1, ptr [[TMP12]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP14]]
+; CHECK-NEXT: store i16 1, ptr [[TMP15]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP17]]
+; CHECK-NEXT: store i16 1, ptr [[TMP18]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT: store i16 1, ptr [[TMP21]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_IF13]]:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT: store i16 1, ptr [[TMP24]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_CONTINUE14]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65536, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 65535
+; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.iv = getelementptr inbounds i16, ptr %p, i64 %iv
+ store i16 1, ptr %p.iv, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp eq i64 %iv.next, 65535
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
+define dso_local void @canonical_lower_limit_i32(ptr nocapture noundef writeonly %p) {
+; CHECK-LABEL: define dso_local void @canonical_lower_limit_i32(
+; CHECK-SAME: ptr nocapture noundef writeonly [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 65536)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: store i16 1, ptr [[TMP3]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: store i16 1, ptr [[TMP6]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT: store i16 1, ptr [[TMP9]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT: store i16 1, ptr [[TMP12]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP14]]
+; CHECK-NEXT: store i16 1, ptr [[TMP15]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP17]]
+; CHECK-NEXT: store i16 1, ptr [[TMP18]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT: store i16 1, ptr [[TMP21]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_IF13]]:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT: store i16 1, ptr [[TMP24]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_CONTINUE14]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65544
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65544, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 65537
+; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.iv = getelementptr inbounds i16, ptr %p, i64 %iv
+ store i16 1, ptr %p.iv, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp eq i64 %iv.next, 65537
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
+define dso_local void @canonical_upper_limit_i32(ptr nocapture noundef writeonly %p) {
+; CHECK-LABEL: define dso_local void @canonical_upper_limit_i32(
+; CHECK-SAME: ptr nocapture noundef writeonly [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 4294967294)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: store i16 1, ptr [[TMP3]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: store i16 1, ptr [[TMP6]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT: store i16 1, ptr [[TMP9]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT: store i16 1, ptr [[TMP12]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP14]]
+; CHECK-NEXT: store i16 1, ptr [[TMP15]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP17]]
+; CHECK-NEXT: store i16 1, ptr [[TMP18]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT: store i16 1, ptr [[TMP21]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_IF13]]:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT: store i16 1, ptr [[TMP24]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_CONTINUE14]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967296
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967296, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 4294967295
+; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.iv = getelementptr inbounds i16, ptr %p, i64 %iv
+ store i16 1, ptr %p.iv, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp eq i64 %iv.next, 4294967295
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
+define dso_local void @canonical_lower_limit_i64(ptr nocapture noundef writeonly %p) {
+; CHECK-LABEL: define dso_local void @canonical_lower_limit_i64(
+; CHECK-SAME: ptr nocapture noundef writeonly [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 4294967296)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: store i16 1, ptr [[TMP3]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: store i16 1, ptr [[TMP6]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT: store i16 1, ptr [[TMP9]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT: store i16 1, ptr [[TMP12]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP14]]
+; CHECK-NEXT: store i16 1, ptr [[TMP15]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP17]]
+; CHECK-NEXT: store i16 1, ptr [[TMP18]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT: store i16 1, ptr [[TMP21]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_IF13]]:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT: store i16 1, ptr [[TMP24]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_CONTINUE14]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967304
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967304, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 4294967297
+; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.iv = getelementptr inbounds i16, ptr %p, i64 %iv
+ store i16 1, ptr %p.iv, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp eq i64 %iv.next, 4294967297
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
+define dso_local void @canonical_upper_limit_i64(ptr nocapture noundef writeonly %p) {
+; CHECK-LABEL: define dso_local void @canonical_upper_limit_i64(
+; CHECK-SAME: ptr nocapture noundef writeonly [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 -2)
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: store i16 1, ptr [[TMP3]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
+; CHECK: [[PRED_STORE_IF1]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT: store i16 1, ptr [[TMP6]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
+; CHECK: [[PRED_STORE_CONTINUE2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT: store i16 1, ptr [[TMP9]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP11]]
+; CHECK-NEXT: store i16 1, ptr [[TMP12]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP14]]
+; CHECK-NEXT: store i16 1, ptr [[TMP15]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP17]]
+; CHECK-NEXT: store i16 1, ptr [[TMP18]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP20]]
+; CHECK-NEXT: store i16 1, ptr [[TMP21]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_IF13]]:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[TMP23]]
+; CHECK-NEXT: store i16 1, ptr [[TMP24]], align 2
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
+; CHECK: [[PRED_STORE_CONTINUE14]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
+; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], -1
+; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: [[END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %p.iv = getelementptr inbounds i16, ptr %p, i64 %iv
+ store i16 1, ptr %p.iv, align 2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cond = icmp eq i64 %iv.next, 18446744073709551615
+ br i1 %cond, label %end, label %loop
+
+end:
+ ret void
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+;.
>From ab915f742d21920974ea8845edaef2ba055c329c Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Thu, 5 Dec 2024 16:22:15 +0000
Subject: [PATCH 2/4] [LV] Optimize VPWidenIntOrFpInductionRecipe for known TC
Optimize the IR generated for a VPWidenIntOrFpInductionRecipe to use the
narrowest type necessary, when the trip-count of a loop is known to be
constant and the only use of the recipe is the condition used by the
vector loop's backedge branch.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 3 +
llvm/lib/Transforms/Vectorize/VPlan.h | 3 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 54 +++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 4 +
.../AArch64/conditional-branches-cost.ll | 18 ++--
.../pr45679-fold-tail-by-masking.ll | 12 +--
.../LoopVectorize/reduction-inloop-pred.ll | 90 +++++++++----------
.../LoopVectorize/reduction-predselect.ll | 54 +++++------
.../tail-folding-alloca-in-loop.ll | 6 +-
...folding-optimize-vector-induction-width.ll | 36 ++++----
.../LoopVectorize/tail-folding-switch.ll | 6 +-
11 files changed, 175 insertions(+), 111 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3c7c044a042719..3e6774ff14ac1f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7657,6 +7657,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
OrigLoop->getHeader()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
+ auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
+ VPlanTransforms::optimizeForTCAndVF(BestVPlan, TC, BestVF);
+
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
<< ", UF=" << BestUF << '\n');
BestVPlan.setName("Final VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1d828f038f9a2..809efc5988930b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2150,6 +2150,9 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
VPValue *getStepValue() { return getOperand(1); }
const VPValue *getStepValue() const { return getOperand(1); }
+ /// Update the start value of the recipe.
+ void setStepValue(VPValue *V) { setOperand(1, V); }
+
VPValue *getVFValue() { return getOperand(2); }
const VPValue *getVFValue() const { return getOperand(2); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cee83d1015b536..6e27b1b5055446 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -661,6 +661,60 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
}
}
+void VPlanTransforms::optimizeForTCAndVF(VPlan &Plan, unsigned TC,
+ ElementCount BestVF) {
+ assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
+ if (!TC || !BestVF.isFixed())
+ return;
+
+ // Calculate the widest type required for known TC and VF.
+ uint64_t Width = BestVF.getKnownMinValue();
+ uint64_t MaxVal = alignTo(TC, Width) - 1;
+ unsigned MaxActiveBits = Log2_64_Ceil(MaxVal);
+ unsigned NewBitWidth = std::max<unsigned>(PowerOf2Ceil(MaxActiveBits), 8);
+ LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
+ auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
+
+ bool MadeChange = false;
+
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+ auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+ if (!WideIV || !WideIV->isCanonical())
+ continue;
+
+ if (WideIV->hasMoreThanOneUniqueUser())
+ continue;
+
+ // Currently only handle cases where the single user is a header-mask
+ // comparison with the backedge-taken-count.
+ VPValue *Bound;
+ using namespace VPlanPatternMatch;
+ auto *Cmp = dyn_cast<VPInstruction>(*WideIV->user_begin());
+ if (!Cmp ||
+ !match(Cmp, m_Binary<Instruction::ICmp>(m_Specific(WideIV),
+ m_VPValue(Bound))) ||
+ Bound != Plan.getOrCreateBackedgeTakenCount())
+ continue;
+
+ if (NewIVTy == WideIV->getScalarType())
+ continue;
+
+ // Update IV operands and comparison bound to use new narrower type.
+ auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
+ WideIV->setStartValue(NewStart);
+ auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
+ WideIV->setStepValue(NewStep);
+ auto *NewBound = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, TC - 1));
+ Cmp->setOperand(1, NewBound);
+
+ MadeChange = true;
+ }
+
+ if (MadeChange)
+ Plan.setVF(BestVF);
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 1491e0a8df04d5..ba772cf385d560 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -15,6 +15,7 @@
#include "VPlan.h"
#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -58,6 +59,9 @@ struct VPlanTransforms {
unsigned BestUF,
PredicatedScalarEvolution &PSE);
+ /// Optimize \p Plan based on \p TC and \p BestVF.
+ static void optimizeForTCAndVF(VPlan &Plan, unsigned TC, ElementCount BestVF);
+
/// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
/// optimizations, dead recipe removal, replicate region optimizations and
/// block merging.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 69560305706364..bec30b3dcb5289 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -389,8 +389,8 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 99)
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 99)
; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; PRED-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
@@ -456,7 +456,7 @@ define void @latch_branch_cost(ptr %dst) {
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
; PRED: pred.store.continue14:
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: middle.block:
@@ -903,9 +903,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
; DEFAULT: vector.body:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
-; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
; DEFAULT-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
-; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6)
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 6)
; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; DEFAULT-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; DEFAULT: pred.store.if:
@@ -978,7 +978,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; DEFAULT-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE14]]
; DEFAULT: pred.store.continue14:
-; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; DEFAULT: middle.block:
@@ -1005,9 +1005,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ]
; PRED-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
-; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6)
+; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 6)
; PRED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
; PRED-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; PRED: pred.store.if:
@@ -1080,7 +1080,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; PRED-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
; PRED-NEXT: br label [[PRED_STORE_CONTINUE14]]
; PRED: pred.store.continue14:
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; PRED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; PRED: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
index c301ef3c5319a2..b207cca03c90f2 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -18,8 +18,8 @@ define void @pr45679(ptr %A) optsize {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i32> [[VEC_IND]], splat (i32 13)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 13)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
@@ -53,7 +53,7 @@ define void @pr45679(ptr %A) optsize {
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -213,8 +213,8 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 13)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 13)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
@@ -252,7 +252,7 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index 8e132ed8399cd6..c76057a18bf3cd 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -11,9 +11,9 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -57,7 +57,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]])
; CHECK-NEXT: [[TMP26]] = add i32 [[TMP25]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -97,10 +97,10 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE8]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -166,7 +166,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP46]])
; CHECK-NEXT: [[TMP48]] = add i32 [[TMP47]], [[TMP45]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], splat (i32 4)
; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -212,9 +212,9 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -261,7 +261,7 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]])
; CHECK-NEXT: [[TMP29]] = add i32 [[TMP28]], [[TMP26]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
@@ -302,10 +302,10 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE8]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -371,7 +371,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]])
; CHECK-NEXT: [[TMP48]] = mul i32 [[TMP47]], [[TMP45]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], splat (i32 4)
; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -417,10 +417,10 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_LOAD_CONTINUE8]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE8]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -484,7 +484,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]])
; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], splat (i32 4)
; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -530,9 +530,9 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 19, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -595,7 +595,7 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP43]])
; CHECK-NEXT: [[TMP45]] = mul i32 [[TMP44]], [[TMP42]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
@@ -638,9 +638,9 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -1, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -703,7 +703,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP43]])
; CHECK-NEXT: [[TMP45]] = and i32 [[TMP44]], [[TMP42]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
@@ -746,9 +746,9 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -809,7 +809,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP41]])
; CHECK-NEXT: [[TMP43]] = or i32 [[TMP42]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
@@ -852,9 +852,9 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -915,7 +915,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP41]])
; CHECK-NEXT: [[TMP43]] = xor i32 [[TMP42]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
@@ -958,9 +958,9 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -1021,7 +1021,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP42:%.*]] = select fast <4 x i1> [[TMP0]], <4 x float> [[TMP39]], <4 x float> zeroinitializer
; CHECK-NEXT: [[TMP43]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[TMP41]], <4 x float> [[TMP42]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
@@ -1064,9 +1064,9 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -1129,7 +1129,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP43]])
; CHECK-NEXT: [[TMP45]] = fmul fast float [[TMP44]], [[TMP42]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
@@ -1172,9 +1172,9 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -1218,7 +1218,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP24]])
; CHECK-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: middle.block:
@@ -1259,9 +1259,9 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -1305,7 +1305,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP24]])
; CHECK-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
@@ -1438,9 +1438,9 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 255, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255)
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -1488,7 +1488,7 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32>
; CHECK-NEXT: [[TMP30]] = add nuw nsw <4 x i32> [[TMP1]], [[TMP29]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
@@ -1534,9 +1534,9 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 255, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -1583,7 +1583,7 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
; CHECK-NEXT: [[TMP29]] = and <4 x i32> [[VEC_PHI]], [[TMP28]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
index f95be1a221e73b..3d40707a5e97ec 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
@@ -11,9 +11,9 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -56,7 +56,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
; CHECK-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP23]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP25]] = add <4 x i32> [[VEC_PHI]], [[TMP24]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -203,9 +203,9 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1), [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -265,7 +265,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
; CHECK-NEXT: [[TMP41:%.*]] = mul <4 x i32> [[TMP40]], [[TMP39]]
; CHECK-NEXT: [[TMP42]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP41]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
@@ -308,9 +308,9 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -370,7 +370,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> splat (i32 -1)
; CHECK-NEXT: [[TMP42]] = and <4 x i32> [[VEC_PHI]], [[TMP41]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
@@ -413,9 +413,9 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -475,7 +475,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP42]] = or <4 x i32> [[VEC_PHI]], [[TMP41]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
@@ -518,9 +518,9 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -580,7 +580,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP42]] = xor <4 x i32> [[VEC_PHI]], [[TMP41]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
@@ -623,9 +623,9 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -685,7 +685,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP41:%.*]] = fadd fast <4 x float> [[TMP40]], [[TMP39]]
; CHECK-NEXT: [[TMP42]] = select fast <4 x i1> [[TMP0]], <4 x float> [[TMP41]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
@@ -728,9 +728,9 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -790,7 +790,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP41:%.*]] = fmul fast <4 x float> [[TMP40]], [[TMP39]]
; CHECK-NEXT: [[TMP42]] = select fast <4 x i1> [[TMP0]], <4 x float> [[TMP41]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
@@ -833,9 +833,9 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1000), [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -878,7 +878,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP24:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]])
; CHECK-NEXT: [[TMP25]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
@@ -919,9 +919,9 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1000), [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], splat (i32 257)
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i16> [[VEC_IND]], splat (i16 257)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -964,7 +964,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
; CHECK-NEXT: [[TMP24:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]])
; CHECK-NEXT: [[TMP25]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
index 56fc8eac35bad6..3a54244a41017d 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
@@ -10,8 +10,8 @@ define i32 @test(ptr %vf1, i64 %n) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 200)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 -56)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -49,7 +49,7 @@ define i32 @test(ptr %vf1, i64 %n) {
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; CHECK: [[PRED_STORE_CONTINUE6]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 204
; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
index 252d6b063eee9b..ae20e7d823a583 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
@@ -13,8 +13,8 @@ define dso_local void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p)
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 14)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 14)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -80,7 +80,7 @@ define dso_local void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p)
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -122,8 +122,8 @@ define dso_local void @canonical_upper_limit_i8(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 254)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 -2)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -189,7 +189,7 @@ define dso_local void @canonical_upper_limit_i8(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -231,8 +231,8 @@ define dso_local void @canonical_lower_limit_i16(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 256)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i16> [[VEC_IND]], splat (i16 256)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -298,7 +298,7 @@ define dso_local void @canonical_lower_limit_i16(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], splat (i16 8)
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 264
; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -340,8 +340,8 @@ define dso_local void @canonical_upper_limit_i16(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 65534)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i16> [[VEC_IND]], splat (i16 -2)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -407,7 +407,7 @@ define dso_local void @canonical_upper_limit_i16(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i16> [[VEC_IND]], splat (i16 8)
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -449,8 +449,8 @@ define dso_local void @canonical_lower_limit_i32(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 65536)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], splat (i32 65536)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -516,7 +516,7 @@ define dso_local void @canonical_lower_limit_i32(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8)
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65544
; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
@@ -558,8 +558,8 @@ define dso_local void @canonical_upper_limit_i32(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 4294967294)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], splat (i32 -2)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -625,7 +625,7 @@ define dso_local void @canonical_upper_limit_i32(ptr nocapture noundef writeonly
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8)
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967296
; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
index 31732f027f6dd4..59d382c5cce985 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
@@ -13,8 +13,8 @@ define void @tail_fold_switch(ptr %dst, i32 %0) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
@@ -50,7 +50,7 @@ define void @tail_fold_switch(ptr %dst, i32 %0) {
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; CHECK: [[PRED_STORE_CONTINUE6]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
>From 14574f70927c7ddc7f82c0a7fb37a8c2671f6841 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Fri, 6 Dec 2024 12:52:31 +0000
Subject: [PATCH 3/4] Fix regression tests for other Targets
---
.../SystemZ/predicated-first-order-recurrence.ll | 6 +++---
.../LoopVectorize/X86/consecutive-ptr-uniforms.ll | 6 +++---
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
index d0754f1c2bb555..4fbcab9792c1b8 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
@@ -19,10 +19,10 @@ define void @func_21() {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i8> [[VEC_IND]], splat (i8 4)
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -59,7 +59,7 @@ define void @func_21() {
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
; CHECK: pred.store.continue4:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index e685a83d9ccbb2..e26221eeefd2d8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -86,8 +86,8 @@ attributes #0 = { "target-cpu"="knl" }
; FORCE-NEXT: br label [[VECTOR_BODY:%.*]]
; FORCE: vector.body:
; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], splat (i32 2)
+; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ <i8 0, i8 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i8> [[VEC_IND]], splat (i8 2)
; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FORCE: pred.store.if:
@@ -103,7 +103,7 @@ attributes #0 = { "target-cpu"="knl" }
; FORCE-NEXT: br label [[PRED_STORE_CONTINUE4]]
; FORCE: pred.store.continue2:
; FORCE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], splat (i8 2)
; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
; FORCE-NEXT: br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]]
;
>From eaa7c25c62ab7c6ab373ae12e3b3bdfbcba68745 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Tue, 17 Dec 2024 14:11:46 +0000
Subject: [PATCH 4/4] Address review comments
- Fix incorrect comment
- Consider UF as well as VF
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 +++++++++-----
llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 5 +++--
4 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3e6774ff14ac1f..e8b092e0ddc83b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7658,7 +7658,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
- VPlanTransforms::optimizeForTCAndVF(BestVPlan, TC, BestVF);
+ VPlanTransforms::optimizeForTCAndVFAndUF(BestVPlan, TC, BestVF, BestUF);
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
<< ", UF=" << BestUF << '\n');
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 809efc5988930b..49300831e1c38a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2150,7 +2150,7 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
VPValue *getStepValue() { return getOperand(1); }
const VPValue *getStepValue() const { return getOperand(1); }
- /// Update the start value of the recipe.
+ /// Update the step value of the recipe.
void setStepValue(VPValue *V) { setOperand(1, V); }
VPValue *getVFValue() { return getOperand(2); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6e27b1b5055446..9cd7e62f2f5e86 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -661,14 +661,16 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
}
}
-void VPlanTransforms::optimizeForTCAndVF(VPlan &Plan, unsigned TC,
- ElementCount BestVF) {
+void VPlanTransforms::optimizeForTCAndVFAndUF(VPlan &Plan, unsigned TC,
+ ElementCount BestVF,
+ unsigned BestUF) {
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
+ assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
if (!TC || !BestVF.isFixed())
return;
- // Calculate the widest type required for known TC and VF.
- uint64_t Width = BestVF.getKnownMinValue();
+ // Calculate the widest type required for known TC, VF and UF.
+ uint64_t Width = BestVF.getKnownMinValue() * BestUF;
uint64_t MaxVal = alignTo(TC, Width) - 1;
unsigned MaxActiveBits = Log2_64_Ceil(MaxVal);
unsigned NewBitWidth = std::max<unsigned>(PowerOf2Ceil(MaxActiveBits), 8);
@@ -711,8 +713,10 @@ void VPlanTransforms::optimizeForTCAndVF(VPlan &Plan, unsigned TC,
MadeChange = true;
}
- if (MadeChange)
+ if (MadeChange) {
Plan.setVF(BestVF);
+ Plan.setUF(BestUF);
+ }
}
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ba772cf385d560..948665b89adc00 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -59,8 +59,9 @@ struct VPlanTransforms {
unsigned BestUF,
PredicatedScalarEvolution &PSE);
- /// Optimize \p Plan based on \p TC and \p BestVF.
- static void optimizeForTCAndVF(VPlan &Plan, unsigned TC, ElementCount BestVF);
+ /// Optimize \p Plan based on \p TC, \p BestVF and \p BestUF.
+ static void optimizeForTCAndVFAndUF(VPlan &Plan, unsigned TC,
+ ElementCount BestVF, unsigned BestUF);
/// Apply VPlan-to-VPlan optimizations to \p Plan, including induction recipe
/// optimizations, dead recipe removal, replicate region optimizations and
More information about the llvm-commits
mailing list