[llvm] [LV][VPlan] Add initial support for CSA vectorization (PR #106560)
Michael Maitland via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 18 09:35:47 PDT 2024
https://github.com/michaelmaitland updated https://github.com/llvm/llvm-project/pull/106560
>From 2abfe2c8d19999a89f969b747cbffbd6ad2fddef Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 19 Aug 2024 12:34:43 -0700
Subject: [PATCH 01/16] [LV] Precommit csa vectorization test cases
---
.../Transforms/LoopVectorize/RISCV/csa.ll | 3520 +++++++++++++++++
1 file changed, 3520 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
new file mode 100644
index 00000000000000..71a5519522a275
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
@@ -0,0 +1,3520 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
+; RUN: -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN: | FileCheck %s -check-prefix=EVL
+; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
+; RUN: -passes=loop-vectorize -force-tail-folding-style=none \
+; RUN: | FileCheck %s -check-prefix=NO-EVL
+; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
+; RUN: -passes=loop-vectorize -force-tail-folding-style=data \
+; RUN: | FileCheck %s -check-prefix=DATA
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select(int N, int *data, int64_t a) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (a < data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
+; EVL-LABEL: @simple_csa_int_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_int_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %1 = sext i32 %0 to i64
+ %cmp1 = icmp slt i64 %a, %1
+ %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select_induction_cmp(int N, int *data) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (i < data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_int_select_induction_cmp(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %1 = sext i32 %0 to i64
+ %cmp1 = icmp slt i64 %indvars.iv, %1
+ %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float_select(int N, float *data) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (0.0f < data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define float @simple_csa_float_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_float_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret float [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_float_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_float_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret float [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp8 = icmp sgt i32 %N, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 0.000000e+00
+ %t.1 = select i1 %cmp1, float %0, float %t.09
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int(int N, bool *cond, int *data) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
+; EVL-LABEL: @simple_csa_int(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_int(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_int(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float(int N, bool *cond, float *data) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
+; EVL-LABEL: @simple_csa_float(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret float [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_float(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_float(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret float [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select(int N, int *data0, int *data1, int64_t a) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (a < data0[i])
+; t = data0[i];
+; if (a < data1[i])
+; s = data1[i];
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) { ; two independent select-based conditional assignments (t and s) in one loop
+; EVL-LABEL: @csa_in_series_int_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret i32 [[OR]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[OR]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_int_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret i32 [[OR]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %N, 0 ; guard: the loop is entered only when N > 0
+ br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; trip count widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = or i32 %s.1, %spec.select ; combine the final s and t values (s | t)
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ] ; result is -1 when the loop was skipped
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; loop index i, starting at 0
+ %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ] ; running value of s, initially -1
+ %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ] ; running value of t, initially -1
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx, align 4 ; data0[i]
+ %2 = sext i32 %1 to i64 ; sign-extend so it can be compared against the i64 %a
+ %cmp1 = icmp slt i64 %a, %2 ; a < data0[i] (signed)
+ %spec.select = select i1 %cmp1, i32 %1, i32 %t.022 ; t takes data0[i] when the compare holds, else keeps its previous value
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %3 = load i32, ptr %arrayidx5, align 4 ; data1[i]
+ %4 = sext i32 %3 to i64 ; sign-extend so it can be compared against the i64 %a
+ %cmp6 = icmp slt i64 %a, %4 ; a < data1[i] (signed)
+ %s.1 = select i1 %cmp6, i32 %3, i32 %s.023 ; s takes data1[i] when the compare holds, else keeps its previous value
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; leave the loop once the next index equals the trip count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (i < data0[i])
+; t = data0[i];
+; if (i < data1[i])
+; s = data1[i];
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) { ; like the two-select loop, but each compare is against the loop index instead of a loop-invariant value
+; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret i32 [[OR]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[OR]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret i32 [[OR]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %N, 0 ; guard: the loop is entered only when N > 0
+ br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; trip count widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = or i32 %s.1, %spec.select ; combine the final s and t values (s | t)
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ] ; result is -1 when the loop was skipped
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; loop index i, also the left operand of both compares below
+ %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ] ; running value of s, initially -1
+ %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ] ; running value of t, initially -1
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx, align 4 ; data0[i]
+ %2 = sext i32 %1 to i64 ; sign-extend so it can be compared against the i64 index
+ %cmp1 = icmp slt i64 %indvars.iv, %2 ; i < data0[i] (signed); the condition depends on the induction variable
+ %spec.select = select i1 %cmp1, i32 %1, i32 %t.022 ; t takes data0[i] when the compare holds, else keeps its previous value
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %3 = load i32, ptr %arrayidx5, align 4 ; data1[i]
+ %4 = sext i32 %3 to i64 ; sign-extend so it can be compared against the i64 index
+ %cmp6 = icmp slt i64 %indvars.iv, %4 ; i < data1[i] (signed); the condition depends on the induction variable
+ %s.1 = select i1 %cmp6, i32 %3, i32 %s.023 ; s takes data1[i] when the compare holds, else keeps its previous value
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; leave the loop once the next index equals the trip count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float_select(int N, float *data0,
+; float *data1) {
+; float t = 1.0f;
+; float s = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (0.0f < data0[i])
+; t = data0[i];
+; if (0.0f < data1[i])
+; s = data1[i];
+; }
+; return t + s; // use t and s
+; }
+define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) { ; float variant: two independent select-based conditional assignments (t and s) in one loop
+; EVL-LABEL: @csa_in_series_float_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret float [[ADD]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_float_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret float [[ADD]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_float_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret float [[ADD]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp19 = icmp sgt i32 %N, 0 ; guard: the loop is entered only when N > 0
+ br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; trip count widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = fadd float %t.1, %s.1 ; combine the final t and s values (t + s)
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ] ; result is 2.0 when the loop was skipped
+ ret float %add
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; loop index i, starting at 0
+ %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ] ; running value of s, initially 1.0
+ %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ] ; running value of t, initially 1.0
+ %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx, align 4 ; data0[i]
+ %cmp1 = fcmp ogt float %1, 0.000000e+00 ; data0[i] > 0.0; ordered compare, so false when data0[i] is NaN
+ %t.1 = select i1 %cmp1, float %1, float %t.020 ; t takes data0[i] when the compare holds, else keeps its previous value
+ %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx5, align 4 ; data1[i]
+ %cmp6 = fcmp ogt float %2, 0.000000e+00 ; data1[i] > 0.0; ordered compare, so false when data1[i] is NaN
+ %s.1 = select i1 %cmp6, float %2, float %s.021 ; s takes data1[i] when the compare holds, else keeps its previous value
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; leave the loop once the next index equals the trip count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; if (cond1[i])
+; s = data1[i];
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_int(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret i32 [[OR]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[IF_END]]
+; EVL: if.end:
+; EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL: if.then6:
+; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_int(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[OR]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[IF_END]]
+; NO-EVL: if.end:
+; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL: if.then6:
+; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_int(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret i32 [[OR]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[IF_END]]
+; DATA: if.end:
+; DATA-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA: if.then6:
+; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp15 = icmp sgt i32 %N, 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = or i32 %s.1, %t.1
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+ %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %if.end, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %3 = load i8, ptr %arrayidx4, align 1
+ %tobool5.not = icmp eq i8 %3, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6: ; preds = %if.end
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %4 = load i32, ptr %arrayidx8, align 4
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then6
+ %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float(int N, bool *cond0, bool *cond1, float *data0,
+;                           float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_float(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret float [[ADD]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[IF_END]]
+; EVL: if.end:
+; EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL: if.then6:
+; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_float(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret float [[ADD]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[IF_END]]
+; NO-EVL: if.end:
+; NO-EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL: if.then6:
+; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_float(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret float [[ADD]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[IF_END]]
+; DATA: if.end:
+; DATA-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA: if.then6:
+; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry: ; guard: the loop is skipped entirely when N <= 0
+ %cmp15 = icmp sgt i32 %N, 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to i64 to compare against the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = fadd float %t.1, %s.1 ; t + s using the values from the final iteration
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ] ; 2.0 = 1.0 + 1.0, the initial t + s when the loop never runs
+ ret float %add
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried across iterations, starts at 1.0
+ %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried across iterations, starts at 1.0
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1 ; cond0[i]
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %if.end, label %if.then ; cond0[i] == 0 skips the data0 load
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx2, align 4 ; data0[i], loaded only when cond0[i] is nonzero
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ] ; t after the first conditional assignment
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %3 = load i8, ptr %arrayidx4, align 1 ; cond1[i]
+ %tobool5.not = icmp eq i8 %3, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6 ; cond1[i] == 0 skips the data1 load
+
+if.then6: ; preds = %if.end
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %4 = load float, ptr %arrayidx8, align 4 ; data1[i], loaded only when cond1[i] is nonzero
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then6
+ %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ] ; s after the second conditional assignment
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int_select(int N, int *data0,
+;                                          int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (i < data0[i])
+;       t = data0[i];
+;     if (i < data1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_int_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; EVL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
+; EVL-NEXT: [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; NO-EVL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
+; NO-EVL-NEXT: [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_same_scalar_int_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
+; DATA-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
+; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
+; DATA-NEXT: [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry: ; guard: the loop is skipped entirely when N <= 0
+ %cmp21 = icmp sgt i32 %N, 0
+ br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to i64 to compare against the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ] ; -1 is t's initial value, returned when the loop never runs
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ] ; t carried across iterations, starts at -1
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4 ; data0[i]
+ %1 = sext i32 %0 to i64 ; sign-extended so the compare against the i64 index is done in 64 bits
+ %cmp1 = icmp slt i64 %indvars.iv, %1 ; i < data0[i]
+ %spec.select = select i1 %cmp1, i32 %0, i32 %t.022 ; t after the first conditional assignment
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx5, align 4 ; data1[i]
+ %3 = sext i32 %2 to i64
+ %cmp6 = icmp slt i64 %indvars.iv, %3 ; i < data1[i]
+ %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select ; second assignment wins when its condition holds, otherwise keep the first result
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float_select(int N,
+;                                              float *data0, float *data1) {
+;   float t = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (0.0f < data0[i])
+;       t = data0[i];
+;     if (0.0f < data1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_float_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret float [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; EVL-NEXT: [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; EVL-NEXT: [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; NO-EVL-NEXT: [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; NO-EVL-NEXT: [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_same_scalar_float_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret float [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
+; DATA-NEXT: [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
+; DATA-NEXT: [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry: ; guard: the loop is skipped entirely when N <= 0
+ %cmp19 = icmp sgt i32 %N, 0
+ br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to i64 to compare against the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ] ; 1.0 is t's initial value, returned when the loop never runs
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ] ; t carried across iterations, starts at 1.0
+ %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4 ; data0[i]
+ %cmp1 = fcmp ogt float %0, 0.000000e+00 ; 0.0f < data0[i]; ordered compare, so a NaN element yields false
+ %t.1 = select i1 %cmp1, float %0, float %t.020 ; t after the first conditional assignment
+ %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx5, align 4 ; data1[i]
+ %cmp6 = fcmp ogt float %1, 0.000000e+00 ; 0.0f < data1[i]
+ %t.2 = select i1 %cmp6, float %1, float %t.1 ; second assignment wins when its condition holds, otherwise keep the first result
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+;                                   int *data1) {
+;   int t = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     if (cond1[i])
+;       t = data1[i];
+;   }
+;   return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_int(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[IF_END]]
+; EVL: if.end:
+; EVL-NEXT: [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL: if.then6:
+; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[IF_END]]
+; NO-EVL: if.end:
+; NO-EVL-NEXT: [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL: if.then6:
+; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_same_scalar_int(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[IF_END]]
+; DATA: if.end:
+; DATA-NEXT: [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA: if.then6:
+; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp15 = icmp sgt i32 %N, 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %if.end, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx4, align 1
+ %tobool5.not = icmp eq i8 %2, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6: ; preds = %if.end
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %3 = load i32, ptr %arrayidx8, align 4
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then6
+ %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; Two conditional assignments in series to the same float scalar; generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float(int N, bool *cond0, bool *cond1,
+; float *data0, float *data1) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; if (cond1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_float(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret float [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[IF_END]]
+; EVL: if.end:
+; EVL-NEXT: [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL: if.then6:
+; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[IF_END]]
+; NO-EVL: if.end:
+; NO-EVL-NEXT: [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL: if.then6:
+; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_in_series_same_scalar_float(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret float [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[IF_END]]
+; DATA: if.end:
+; DATA-NEXT: [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA: if.then6:
+; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp15 = icmp sgt i32 %N, 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ] ; returned t: initial 1.0 if the loop is skipped, else the last iteration's value
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; loop index i
+ %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1 ; cond0[i]
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %if.end, label %if.then ; skip the first assignment when cond0[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx2, align 4 ; data0[i]
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ] ; t after the first conditional assignment
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx4, align 1 ; cond1[i]
+ %tobool5.not = icmp eq i8 %2, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6 ; skip the second assignment when cond1[i] is zero
+
+if.then6: ; preds = %if.end
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %3 = load float, ptr %arrayidx8, align 4 ; data1[i]
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then6
+ %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ] ; t after the second conditional assignment
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; exit once i + 1 reaches the zero-extended N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; Two int scalars conditionally assigned under one shared condition; generated from the following C/C++ program:
+; int csa_same_cond_int(int N, bool *cond, int *data0, int *data1) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond[i]) {
+; t = data0[i];
+; s = data1[i];
+; }
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_same_cond_int(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret i32 [[OR]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_same_cond_int(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[OR]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_same_cond_int(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret i32 [[OR]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = or i32 %s.1, %t.1 ; t | s on the values left by the last iteration
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ] ; -1 when the loop is skipped: t and s both keep their initial -1
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; loop index i
+ %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried into this iteration
+ %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1 ; cond[i]
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %for.inc, label %if.then ; skip both assignments when cond[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2, align 4 ; data0[i], the new t
+ %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %3 = load i32, ptr %arrayidx4, align 4 ; data1[i], the new s
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ] ; t after the conditional assignment
+ %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ] ; s after the conditional assignment
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; exit once i + 1 reaches the zero-extended N
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Two float scalars conditionally assigned under one shared condition; generated from the following C/C++ program:
+; float csa_same_cond_float(int N, bool *cond, float *data0, float *data1) {
+; float t = 1.0f;
+; float s = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond[i]) {
+; t = data0[i];
+; s = data1[i];
+; }
+; }
+; return t + s; // use t and s
+; }
+define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_same_cond_float(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret float [[ADD]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_same_cond_float(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret float [[ADD]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_same_cond_float(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret float [[ADD]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = fadd float %t.1, %s.1 ; t + s on the values left by the last iteration
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ] ; 2.0 when the loop is skipped: initial t (1.0) + initial s (1.0)
+ ret float %add
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; loop index i
+ %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried into this iteration
+ %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1 ; cond[i]
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %for.inc, label %if.then ; skip both assignments when cond[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx2, align 4 ; data0[i], the new t
+ %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %3 = load float, ptr %arrayidx4, align 4 ; data1[i], the new s
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ] ; t after the conditional assignment
+ %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ] ; s after the conditional assignment
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; exit once i + 1 reaches the zero-extended N
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+; int *data1) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; else if (cond1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_same_scalar_int(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; EVL: if.else:
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; EVL: for.inc.sink.split:
+; EVL-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; NO-EVL: if.else:
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; NO-EVL: for.inc.sink.split:
+; NO-EVL-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_else_if_same_scalar_int(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; DATA: if.else:
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; DATA: for.inc.sink.split:
+; DATA-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry: ; scalar input loop selecting one i32 value t; the EVL/NO-EVL/DATA checks above list only scalar blocks
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop runs only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to i64 for the exit compare in for.inc
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ] ; -1 when the loop is skipped, otherwise t after the final iteration
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; loop index, starts at 0
+ %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried across iterations, initially -1
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1 ; byte at cond0[i]
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %if.else, label %for.inc.sink.split ; non-zero cond0[i] goes straight to the shared load
+
+if.else: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx4, align 1 ; byte at cond1[i]
+ %tobool5.not = icmp eq i8 %1, 0
+ br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split ; zero cond1[i] leaves t unchanged for this iteration
+
+for.inc.sink.split: ; preds = %if.else, %for.body
+ %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ] ; base pointer depends on the predecessor: data0 or data1
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2, align 4 ; the single load shared by both assignments to t
+ br label %for.inc
+
+for.inc: ; preds = %for.inc.sink.split, %if.else
+ %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ] ; previous t from if.else, or the freshly loaded value
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; exit once the next index equals the widened N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_same_scalar_float(int N, bool *cond0, bool *cond1,
+; float *data0, float *data1) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; else if (cond1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_same_scalar_float(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret float [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; EVL: if.else:
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; EVL: for.inc.sink.split:
+; EVL-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; NO-EVL: if.else:
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; NO-EVL: for.inc.sink.split:
+; NO-EVL-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_else_if_same_scalar_float(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret float [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
+; DATA: if.else:
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
+; DATA: for.inc.sink.split:
+; DATA-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry: ; scalar input loop selecting one float value t; the EVL/NO-EVL/DATA checks above list only scalar blocks
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop runs only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to i64 for the exit compare in for.inc
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ] ; 1.0 when the loop is skipped, otherwise t after the final iteration
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; loop index, starts at 0
+ %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried across iterations, initially 1.0
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1 ; byte at cond0[i]
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %if.else, label %for.inc.sink.split ; non-zero cond0[i] goes straight to the shared load
+
+if.else: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx4, align 1 ; byte at cond1[i]
+ %tobool5.not = icmp eq i8 %1, 0
+ br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split ; zero cond1[i] leaves t unchanged for this iteration
+
+for.inc.sink.split: ; preds = %if.else, %for.body
+ %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ] ; base pointer depends on the predecessor: data0 or data1
+ %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx2, align 4 ; the single load shared by both assignments to t
+ br label %for.inc
+
+for.inc: ; preds = %for.inc.sink.split, %if.else
+ %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ] ; previous t from if.else, or the freshly loaded value
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; exit once the next index equals the widened N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; else if (cond1[i])
+; s = data1[i];
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_int(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret i32 [[OR]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: if.else:
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL: if.then6:
+; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; EVL-NEXT: [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_else_if_int(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[OR]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: if.else:
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL: if.then6:
+; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; NO-EVL-NEXT: [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_else_if_int(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret i32 [[OR]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: if.else:
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA: if.then6:
+; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; DATA-NEXT: [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry: ; scalar input loop selecting two i32 values, t and s; the EVL/NO-EVL/DATA checks above list only scalar blocks
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop runs only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to i64 for the exit compare in for.inc
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = or i32 %s.1, %t.1 ; combines the final s and t into the return value
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ] ; constant -1 when the loop is skipped, otherwise s | t
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; loop index, starts at 0
+ %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried across iterations, initially -1
+ %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried across iterations, initially -1
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1 ; byte at cond0[i]
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %if.else, label %if.then ; non-zero cond0[i] takes the t-updating path
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2, align 4 ; data0[i], becomes the new t in for.inc
+ br label %for.inc
+
+if.else: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %3 = load i8, ptr %arrayidx4, align 1 ; byte at cond1[i]
+ %tobool5.not = icmp eq i8 %3, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6 ; zero cond1[i] leaves both t and s unchanged
+
+if.then6: ; preds = %if.else
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %4 = load i32, ptr %arrayidx8, align 4 ; data1[i], becomes the new s in for.inc
+ br label %for.inc
+
+for.inc: ; preds = %if.then, %if.then6, %if.else
+ %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ] ; t changes only when arriving from if.then
+ %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ] ; s changes only when arriving from if.then6
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; exit once the next index equals the widened N
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_float(int N, bool *cond0, bool *cond1, float *data0,
+; float *data1) {
+; float t = 1.0f;
+; float s = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; else if (cond1[i])
+; s = data1[i];
+; }
+; return t + s; // use t and s
+; }
+define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_float(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret float [[ADD]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; EVL-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; EVL: if.then:
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: if.else:
+; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; EVL: if.then6:
+; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; EVL-NEXT: br label [[FOR_INC]]
+; EVL: for.inc:
+; EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; EVL-NEXT: [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @csa_else_if_float(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret float [[ADD]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; NO-EVL: if.then:
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: if.else:
+; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; NO-EVL: if.then6:
+; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; NO-EVL-NEXT: br label [[FOR_INC]]
+; NO-EVL: for.inc:
+; NO-EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; NO-EVL-NEXT: [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @csa_else_if_float(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret float [[ADD]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; DATA-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
+; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; DATA: if.then:
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: if.else:
+; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
+; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
+; DATA: if.then6:
+; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; DATA-NEXT: br label [[FOR_INC]]
+; DATA: for.inc:
+; DATA-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
+; DATA-NEXT: [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp15 = icmp sgt i32 %N, 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = fadd float %t.1, %s.1
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+ ret float %add
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+ %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx2, align 4
+ br label %for.inc
+
+if.else: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %3 = load i8, ptr %arrayidx4, align 1
+ %tobool5.not = icmp eq i8 %3, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6: ; preds = %if.else
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %4 = load float, ptr %arrayidx8, align 4
+ br label %for.inc
+
+for.inc: ; preds = %if.then, %if.then6, %if.else
+ %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+ %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+; uint64_t idx = ii;
+; for (uint64_t i = 0; i < n; ++i)
+; idx = (a[i] > b[i]) ? i : idx;
+; return idx;
+; }
+define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
+; EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
+; EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @idx_scalar(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; NO-EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; NO-EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; NO-EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @idx_scalar(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
+; DATA-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
+; DATA-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; DATA-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; DATA-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; DATA-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp8.not = icmp eq i64 %n, 0
+ br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %cond.lcssa = phi i64 [ %cond, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+ ret i64 %idx.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
+ %0 = load i64, ptr %arrayidx, align 8
+ %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
+ %1 = load i64, ptr %arrayidx1, align 8
+ %cmp2 = icmp sgt i64 %0, %1
+ %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
+ %inc = add nuw i64 %i.010, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+; uint64_t idx = ii;
+; for (uint64_t i = n; i > 0; --i) // decreasing
+; idx = (a[i - 1] > b[i - 1]) ? i : idx;
+; return idx;
+; }
+define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar_dec(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
+; EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
+; EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @idx_scalar_dec(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; NO-EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; NO-EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @idx_scalar_dec(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: [[SUB]] = add i64 [[I_011]], -1
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
+; DATA-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
+; DATA-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; DATA-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; DATA-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; DATA-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; DATA-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp.not9 = icmp eq i64 %n, 0
+ br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %cond.lcssa = phi i64 [ %cond, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+ ret i64 %idx.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
+ %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+ %sub = add i64 %i.011, -1
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
+ %0 = load i64, ptr %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
+ %1 = load i64, ptr %arrayidx2, align 8
+ %cmp3 = icmp sgt i64 %0, %1
+ %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
+ %cmp.not = icmp eq i64 %sub, 0
+ br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the C/C++ program below.
+; The key part of this function is that the true arm of the select keeps the
+; previous value of t, instead of selecting the new value.
+; int simple_csa_int_select_neg_cond(int N, int *data) {
+; int t = 0;
+; for (int i = 0; i < N; i++) {
+; if (i != data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_neg_cond(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_int_select_neg_cond(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; DATA-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %1 = zext i32 %0 to i64
+ %cmp1.not = icmp eq i64 %indvars.iv, %1
+ %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int *simple_csa_ptr_select(int N, int **data) {
+; int *t = nullptr;
+; for (int i = 0; i < N; i++) {
+; if (i < *data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_ptr_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret ptr [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_ptr_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret ptr [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_ptr_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret ptr [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret ptr %t.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+ %0 = load ptr, ptr %arrayidx, align 8
+ %1 = load i32, ptr %0, align 4
+ %2 = sext i32 %1 to i64
+ %cmp1 = icmp slt i64 %indvars.iv, %2
+ %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
>From b2fad8a7d3b5c00bf37153fc12ebcb2afc99d8ac Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Thu, 8 Aug 2024 10:31:27 -0700
Subject: [PATCH 02/16] [CSA] Add CSADescriptors Analysis
---
llvm/include/llvm/Analysis/CSADescriptors.h | 78 +++++++++++++++++++++
llvm/lib/Analysis/CMakeLists.txt | 1 +
llvm/lib/Analysis/CSADescriptors.cpp | 73 +++++++++++++++++++
3 files changed, 152 insertions(+)
create mode 100644 llvm/include/llvm/Analysis/CSADescriptors.h
create mode 100644 llvm/lib/Analysis/CSADescriptors.cpp
diff --git a/llvm/include/llvm/Analysis/CSADescriptors.h b/llvm/include/llvm/Analysis/CSADescriptors.h
new file mode 100644
index 00000000000000..edd98777d84ab6
--- /dev/null
+++ b/llvm/include/llvm/Analysis/CSADescriptors.h
@@ -0,0 +1,78 @@
+//===- llvm/Analysis/CSADescriptors.h - CSA Descriptors --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file "describes" conditional scalar assignments (CSA).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+
+#ifndef LLVM_ANALYSIS_CSADESCRIPTORS_H
+#define LLVM_ANALYSIS_CSADESCRIPTORS_H
+
+namespace llvm {
+
+/// A Conditional Scalar Assignment (CSA) is an assignment from an initial
+/// scalar that may or may not occur.
+class CSADescriptor {
+ /// If the conditional assignment occurs inside a loop, then Phi chooses
+ /// the value of the assignment from the entry block or the loop body block.
+ PHINode *Phi = nullptr;
+
+ /// The initial value of the CSA. If the condition guarding the assignment is
+ /// not met, then the assignment retains this value.
+ Value *InitScalar = nullptr;
+
+ /// The Instruction that performs the conditional assignment inside the loop.
+ Instruction *Assignment = nullptr;
+
+ /// Create a CSA Descriptor that models an invalid CSA.
+ CSADescriptor() = default;
+
+ /// Create a CSA Descriptor that models a valid CSA with its members
+ /// initialized correctly.
+ CSADescriptor(PHINode *Phi, Instruction *Assignment, Value *InitScalar)
+ : Phi(Phi), InitScalar(InitScalar), Assignment(Assignment) {}
+
+public:
+ /// If Phi is the root of a CSA, return the CSADescriptor of the CSA rooted by
+ /// Phi. Otherwise, return an invalid CSADescriptor (isValid() is false).
+ static CSADescriptor isCSAPhi(PHINode *Phi, Loop *TheLoop);
+
+ operator bool() const { return isValid(); }
+
+ /// Returns whether SI is the Assignment in CSA
+ static bool isCSASelect(CSADescriptor Desc, SelectInst *SI) {
+ return Desc.getAssignment() == SI;
+ }
+
+ /// Return whether this CSADescriptor models a valid CSA.
+ bool isValid() const { return Phi && InitScalar && Assignment; }
+
+ /// Return the PHI that roots this CSA.
+ PHINode *getPhi() const { return Phi; }
+
+ /// Return the initial value of the CSA. This is the value if the conditional
+ /// assignment does not occur.
+ Value *getInitScalar() const { return InitScalar; }
+
+ /// Return the assignment Instruction, i.e. the value used after the loop.
+ Instruction *getAssignment() const { return Assignment; }
+
+ /// Return the condition that this CSA is conditional upon.
+ Value *getCond() const {
+ if (auto *SI = dyn_cast_or_null<SelectInst>(Assignment))
+ return SI->getCondition();
+ return nullptr;
+ }
+};
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_CSADESCRIPTORS_H
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 393803fad89383..24ca426990d9ed 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -46,6 +46,7 @@ add_llvm_component_library(LLVMAnalysis
CostModel.cpp
CodeMetrics.cpp
ConstantFolding.cpp
+ CSADescriptors.cpp
CtxProfAnalysis.cpp
CycleAnalysis.cpp
DDG.cpp
diff --git a/llvm/lib/Analysis/CSADescriptors.cpp b/llvm/lib/Analysis/CSADescriptors.cpp
new file mode 100644
index 00000000000000..d0377c8c16de33
--- /dev/null
+++ b/llvm/lib/Analysis/CSADescriptors.cpp
@@ -0,0 +1,73 @@
+//=== llvm/Analysis/CSADescriptors.cpp - CSA Descriptors -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file "describes" conditional scalar assignments (CSA).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CSADescriptors.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "csa-descriptors"
+
+CSADescriptor CSADescriptor::isCSAPhi(PHINode *Phi, Loop *TheLoop) {
+ // Return CSADescriptor that describes a CSA that matches one of these
+ // patterns:
+ // phi loop_inv, (select cmp, value, phi)
+ // phi loop_inv, (select cmp, phi, value)
+ // phi (select cmp, value, phi), loop_inv
+ // phi (select cmp, phi, value), loop_inv
+ // If the CSA does not match any of these patterns, return a CSADescriptor
+ // that describes an InvalidCSA.
+
+ // Must be a scalar
+ Type *Type = Phi->getType();
+ if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
+ !Type->isPointerTy())
+ return CSADescriptor();
+
+ // Match phi loop_inv, (select cmp, value, phi)
+ // or phi loop_inv, (select cmp, phi, value)
+ // or phi (select cmp, value, phi), loop_inv
+ // or phi (select cmp, phi, value), loop_inv
+ if (Phi->getNumIncomingValues() != 2)
+ return CSADescriptor();
+ auto SelectInstIt = find_if(Phi->incoming_values(), [&Phi](Use &U) {
+ return match(U.get(), m_Select(m_Value(), m_Specific(Phi), m_Value())) ||
+ match(U.get(), m_Select(m_Value(), m_Value(), m_Specific(Phi)));
+ });
+ if (SelectInstIt == Phi->incoming_values().end())
+ return CSADescriptor();
+ auto LoopInvIt = find_if(Phi->incoming_values(), [&](Use &U) {
+ return U.get() != *SelectInstIt && TheLoop->isLoopInvariant(U.get());
+ });
+ if (LoopInvIt == Phi->incoming_values().end())
+ return CSADescriptor();
+
+ // Phi and Sel must only be used outside the loop, except that Sel may use
+ // Phi and Phi may use Sel.
+ auto IsOnlyUsedOutsideLoop = [=](Value *V, Value *Ignore) {
+ return all_of(V->users(), [Ignore, TheLoop](User *U) {
+ if (U == Ignore)
+ return true;
+ if (auto *I = dyn_cast<Instruction>(U))
+ return !TheLoop->contains(I);
+ return true;
+ });
+ };
+ auto *Sel = cast<SelectInst>(SelectInstIt->get());
+ auto *LoopInv = LoopInvIt->get();
+ if (!IsOnlyUsedOutsideLoop(Phi, Sel) || !IsOnlyUsedOutsideLoop(Sel, Phi))
+ return CSADescriptor();
+
+ return CSADescriptor(Phi, Sel, LoopInv);
+}
>From 67259ff527658d86350917632ad0eeded0b9b944 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Thu, 8 Aug 2024 11:16:32 -0700
Subject: [PATCH 03/16] [LVL][CSA] Legalize CSA vectorization
---
.../llvm/Analysis/TargetTransformInfo.h | 9 +++++
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 ++
.../Vectorize/LoopVectorizationLegality.h | 18 ++++++++++
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +++
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 5 +++
.../Target/RISCV/RISCVTargetTransformInfo.h | 4 +++
.../Vectorize/LoopVectorizationLegality.cpp | 34 ++++++++++++++++---
7 files changed, 72 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 3411163549de2f..013e57544926c5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1769,6 +1769,10 @@ class TargetTransformInfo {
: EVLParamStrategy(EVLParamStrategy), OpStrategy(OpStrategy) {}
};
+ /// \returns true if the loop vectorizer should vectorize conditional
+ /// scalar assignments for the target.
+ bool enableCSAVectorization() const;
+
/// \returns How the target needs this vector-predicated operation to be
/// transformed.
VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const;
@@ -2178,6 +2182,7 @@ class TargetTransformInfo::Concept {
virtual bool supportsScalableVectors() const = 0;
virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const = 0;
+ virtual bool enableCSAVectorization() const = 0;
virtual VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2945,6 +2950,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
}
+ bool enableCSAVectorization() const override {
+ return Impl.enableCSAVectorization();
+ }
+
VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 2819af30cd1704..7ffb5688a5a707 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -966,6 +966,8 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool enableCSAVectorization() const { return false; }
+
TargetTransformInfo::VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const {
return TargetTransformInfo::VPLegalization(
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 0f4d1355dd2bfe..a492af0b3856ee 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -27,6 +27,7 @@
#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H
#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/CSADescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -257,6 +258,10 @@ class LoopVectorizationLegality {
/// induction descriptor.
using InductionList = MapVector<PHINode *, InductionDescriptor>;
+ /// CSAList contains the CSA descriptors for all the CSAs that were found
+ /// in the loop, rooted by their phis.
+ using CSAList = MapVector<PHINode *, CSADescriptor>;
+
/// RecurrenceSet contains the phi nodes that are recurrences other than
/// inductions and reductions.
using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
@@ -309,6 +314,12 @@ class LoopVectorizationLegality {
/// Returns True if V is a Phi node of an induction variable in this loop.
bool isInductionPhi(const Value *V) const;
+ /// Returns the CSAs found in the loop.
+ const CSAList &getCSAs() const { return CSAs; }
+
+ /// Returns true if Phi is the root of a CSA in the loop.
+ bool isCSAPhi(PHINode *Phi) const { return CSAs.count(Phi) != 0; }
+
/// Returns a pointer to the induction descriptor, if \p Phi is an integer or
/// floating point induction.
const InductionDescriptor *getIntOrFpInductionDescriptor(PHINode *Phi) const;
@@ -463,6 +474,10 @@ class LoopVectorizationLegality {
void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit);
+ // Updates the vectorization state by adding \p Phi to the CSA list.
+ void addCSAPhi(PHINode *Phi, const CSADescriptor &CSADesc,
+ SmallPtrSetImpl<Value *> &AllowedExit);
+
/// The loop that we evaluate.
Loop *TheLoop;
@@ -507,6 +522,9 @@ class LoopVectorizationLegality {
/// variables can be pointers.
InductionList Inductions;
+ /// Holds the conditional scalar assignments
+ CSAList CSAs;
+
/// Holds all the casts that participate in the update chain of the induction
/// variables, and that have been proven to be redundant (possibly under a
/// runtime guard). These casts can be ignored when creating the vectorized
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 67b626f300a101..916a759391c8a0 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1309,6 +1309,10 @@ bool TargetTransformInfo::preferEpilogueVectorization() const {
return TTIImpl->preferEpilogueVectorization();
}
+bool TargetTransformInfo::enableCSAVectorization() const {
+ return TTIImpl->enableCSAVectorization();
+}
+
TargetTransformInfo::VPLegalization
TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
return TTIImpl->getVPLegalizationStrategy(VPI);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5d280b44630aef..817238c670b10c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2088,6 +2088,11 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+bool RISCVTTIImpl::enableCSAVectorization() const {
+ return ST->hasVInstructions() &&
+ ST->getProcFamily() == RISCVSubtarget::SiFive7;
+}
+
bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
auto *VTy = dyn_cast<VectorType>(DataTy);
if (!VTy || VTy->isScalableTy())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 763b89bfec0a66..bbd54ce3620a37 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -293,6 +293,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return TLI->isVScaleKnownToBeAPowerOfTwo();
}
+ /// \returns true if the loop vectorizer should vectorize conditional
+ /// scalar assignments for the target.
+ bool enableCSAVectorization() const;
+
/// \returns How the target needs this vector-predicated operation to be
/// transformed.
TargetTransformInfo::VPLegalization
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 7062e21383a5fc..50ee706b875ba1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -79,6 +79,10 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
"Scalable vectorization is available and favored when the "
"cost is inconclusive.")));
+static cl::opt<bool>
+ EnableCSA("enable-csa-vectorization", cl::init(false), cl::Hidden,
+ cl::desc("Control whether CSA loop vectorization is enabled"));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -749,6 +753,15 @@ bool LoopVectorizationLegality::setupOuterLoopInductions() {
return llvm::all_of(Header->phis(), IsSupportedPhi);
}
+void LoopVectorizationLegality::addCSAPhi(
+ PHINode *Phi, const CSADescriptor &CSADesc,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ assert(CSADesc.isValid() && "Expected Valid CSADescriptor");
+ LLVM_DEBUG(dbgs() << "LV: found legal CSA opportunity" << *Phi << "\n");
+ AllowedExit.insert(Phi);
+ CSAs.insert({Phi, CSADesc});
+}
+
/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
@@ -866,14 +879,23 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
- // As a last resort, coerce the PHI to a AddRec expression
- // and re-try classifying it a an induction PHI.
+ // Try to coerce the PHI to an AddRec expression and re-try classifying
+ // it as an induction PHI.
if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
!IsDisallowedStridedPointerInduction(ID)) {
addInductionPhi(Phi, ID, AllowedExit);
continue;
}
+ // Check if the PHI can be classified as a CSA PHI.
+ if (EnableCSA || (TTI->enableCSAVectorization() &&
+ EnableCSA.getNumOccurrences() == 0)) {
+ if (auto CSADesc = CSADescriptor::isCSAPhi(Phi, TheLoop)) {
+ addCSAPhi(Phi, CSADesc, AllowedExit);
+ continue;
+ }
+ }
+
reportVectorizationFailure("Found an unidentified PHI",
"value that could not be identified as "
"reduction is used outside the loop",
@@ -1564,11 +1586,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
for (const auto &Reduction : getReductionVars())
ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
+ SmallPtrSet<const Value *, 8> CSALiveOuts;
+ for (const auto &CSA : getCSAs())
+ CSALiveOuts.insert(CSA.second.getAssignment());
+
// TODO: handle non-reduction outside users when tail is folded by masking.
for (auto *AE : AllowedExit) {
// Check that all users of allowed exit values are inside the loop or
- // are the live-out of a reduction.
- if (ReductionLiveOuts.count(AE))
+ // are the live-out of a reduction or a CSA
+ if (ReductionLiveOuts.count(AE) || CSALiveOuts.count(AE))
continue;
for (User *U : AE->users()) {
Instruction *UI = cast<Instruction>(U);
>From 1d0b3fb8aa74dd39ee92969d5a60bf2deac4bcfd Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Thu, 15 Aug 2024 11:00:06 -0700
Subject: [PATCH 04/16] [LV] Build VPlan for CSA
---
.../Transforms/Vectorize/LoopVectorize.cpp | 190 +-
llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 184 ++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 269 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 44 +
llvm/lib/Transforms/Vectorize/VPlanValue.h | 3 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 6 +-
.../Transforms/LoopVectorize/RISCV/csa.ll | 2457 ++++++++++++++---
8 files changed, 2777 insertions(+), 383 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0fa7c2add1faa2..66819a9d9fdbd5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -173,6 +173,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
+STATISTIC(CSAsVectorized,
+ "Number of conditional scalar assignments vectorized");
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -493,6 +495,10 @@ class InnerLoopVectorizer {
virtual std::pair<BasicBlock *, Value *>
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
+ /// For all vectorized CSAs, replace uses of the live-out scalar from the
+ /// original loop with the scalar extracted from the vector loop.
+ void fixCSALiveOuts(VPTransformState &State, VPlan &Plan);
+
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
@@ -2930,6 +2936,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
TargetTransformInfo::TCK_RecipThroughput);
}
+void InnerLoopVectorizer::fixCSALiveOuts(VPTransformState &State, VPlan &Plan) {
+ for (const auto &CSA : Plan.getCSAStates()) {
+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second->getDataUpdate();
+ assert(VPDataUpdate &&
+ "VPDataUpdate must have been introduced prior to fixing live outs");
+ Value *V = VPDataUpdate->getUnderlyingValue();
+ Value *ExtractedScalar = State.get(CSA.second->getExtractScalarRecipe(), 0,
+ /*NeedsScalar=*/true);
+ // Fix LCSSAPhis
+ llvm::SmallPtrSet<PHINode *, 2> ToFix;
+ for (User *U : V->users())
+ if (auto *Phi = dyn_cast<PHINode>(U);
+ Phi && Phi->getParent() == LoopExitBlock)
+ ToFix.insert(Phi);
+ for (PHINode *Phi : ToFix)
+ Phi->addIncoming(ExtractedScalar, LoopMiddleBlock);
+ }
+}
+
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPlan &Plan) {
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -2970,6 +2995,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
IVEndValues[Entry.first], LoopMiddleBlock,
VectorLoop->getHeader(), Plan, State);
+
+ fixCSALiveOuts(State, Plan);
}
// Fix live-out phis not already fixed earlier.
@@ -4477,6 +4504,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPEVLBasedIVPHISC:
case VPDef::VPPredInstPHISC:
case VPDef::VPBranchOnMaskSC:
+ case VPRecipeBase::VPCSADataUpdateSC:
+ case VPRecipeBase::VPCSAExtractScalarSC:
+ case VPRecipeBase::VPCSAHeaderPHISC:
continue;
case VPDef::VPReductionSC:
case VPDef::VPActiveLaneMaskPHISC:
@@ -8519,9 +8549,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return Recipe;
VPHeaderPHIRecipe *PhiRecipe = nullptr;
- assert((Legal->isReductionVariable(Phi) ||
- Legal->isFixedOrderRecurrence(Phi)) &&
- "can only widen reductions and fixed-order recurrences here");
VPValue *StartV = Operands[0];
if (Legal->isReductionVariable(Phi)) {
const RecurrenceDescriptor &RdxDesc =
@@ -8531,12 +8558,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
CM.isInLoopReduction(Phi),
CM.useOrderedReductions(RdxDesc));
- } else {
+ } else if (Legal->isFixedOrderRecurrence(Phi)) {
// TODO: Currently fixed-order recurrences are modeled as chains of
// first-order recurrences. If there are no users of the intermediate
// recurrences in the chain, the fixed order recurrence should be modeled
// directly, enabling more efficient codegen.
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
+ } else if (Legal->isCSAPhi(Phi)) {
+ VPCSAState *State = Plan.getCSAStates().find(Phi)->second;
+ VPValue *InitData = State->getVPInitData();
+ // When the VF=getFixed(1), InitData is just InitScalar.
+ if (!InitData)
+ InitData = State->getVPInitScalar();
+ PhiRecipe = new VPCSAHeaderPHIRecipe(Phi, InitData);
+ State->setPhiRecipe(cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
+ } else {
+ llvm_unreachable(
+ "can only widen reductions, fixed-order recurrences, and CSAs here");
}
PhisToFix.push_back(PhiRecipe);
@@ -8566,6 +8604,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
make_range(Operands.begin(), Operands.end()));
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
+ auto *CSADescIt = find_if(Legal->getCSAs(), [&](auto CSA) {
+ return CSADescriptor::isCSASelect(CSA.second, SI);
+ });
+ if (CSADescIt != Legal->getCSAs().end()) {
+ PHINode *CSAPhi = CSADescIt->first;
+ VPCSAState *State = Plan.getCSAStates().find(CSAPhi)->second;
+ VPValue *VPDataPhi = State->getPhiRecipe();
+ auto *R = new VPCSADataUpdateRecipe(
+ SI, {VPDataPhi, Operands[0], Operands[1], Operands[2]});
+ State->setDataUpdate(R);
+ return R;
+ }
+
return new VPWidenSelectRecipe(
*SI, make_range(Operands.begin(), Operands.end()));
}
@@ -8578,6 +8629,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return tryToWiden(Instr, Operands, VPBB);
}
+/// Add CSA Recipes that can occur before each instruction in the input IR
+/// is processed and introduced into VPlan.
+static void
+addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
+ VPlan &Plan) {
+
+ // Don't build full CSA for VF=ElementCount::getFixed(1)
+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range);
+
+ for (const auto &CSA : CSAs) {
+ VPValue *VPInitScalar = Plan.getOrAddLiveIn(
+ CSA.first->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
+
+ // Scalar VF builds the scalar version of the loop. In that case,
+ // no maintenance of the mask or extraction in the middle block is needed.
+ if (IsScalarVF) {
+ VPCSAState *S = new VPCSAState(VPInitScalar);
+ Plan.addCSAState(CSA.first, S);
+ continue;
+ }
+
+ auto *VPInitMask =
+ new VPInstruction(VPInstruction::CSAInitMask, {}, DL, "csa.init.mask");
+ auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
+ {VPInitScalar}, DL, "csa.init.data");
+ PreheaderVPBB->appendRecipe(VPInitMask);
+ PreheaderVPBB->appendRecipe(VPInitData);
+
+ auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
+ DL, "csa.mask.phi");
+ HeaderVPBB->appendRecipe(VPMaskPhi);
+
+ auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi);
+ Plan.addCSAState(CSA.first, S);
+ }
+}
+
+/// Add CSA Recipes that must occur after each instruction in the input IR
+/// is processed and introduced into VPlan.
+static void
+addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
+ const LoopVectorizationLegality::CSAList &CSAs,
+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
+ VPlan &Plan) {
+ // Don't build CSA for VF=ElementCount::getFixed(1)
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) { return VF.isScalar(); }, Range))
+ return;
+
+ for (const auto &CSA : CSAs) {
+ VPCSAState *CSAState = Plan.getCSAStates().find(CSA.first)->second;
+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate();
+
+ assert(VPDataUpdate &&
+ "VPDataUpdate must have been introduced prior to postprocess");
+ assert(CSA.second.getCond() &&
+ "CSADescriptor must know how to describe the condition");
+ auto GetVPValue = [&](Value *I) {
+ return RecipeBuilder.getRecipe(cast<Instruction>(I))->getVPSingleValue();
+ };
+ VPValue *WidenedCond = GetVPValue(CSA.second.getCond());
+ VPValue *VPInitScalar = CSAState->getVPInitScalar();
+
+ // The CSA optimization wants to use a condition such that when it is
+ // true, a new value is assigned. However, it is possible that a true lane
+ // in WidenedCond corresponds to selection of the initial value instead.
+ // In that case, we must use the negation of WidenedCond.
+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
+ VPValue *CondToUse = WidenedCond;
+ if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
+ CSA.first) {
+ auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
+ VPNotCond->insertBefore(
+ GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
+ CondToUse = VPNotCond;
+ }
+
+ auto *VPAnyActive = new VPInstruction(
+ VPInstruction::CSAAnyActive, {CondToUse}, DL, "csa.cond.anyactive");
+ VPAnyActive->insertBefore(
+ GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
+
+ auto *VPMaskSel = new VPInstruction(
+ VPInstruction::CSAMaskSel,
+ {CondToUse, CSAState->getVPMaskPhi(), VPAnyActive}, DL, "csa.mask.sel");
+ VPMaskSel->insertAfter(VPAnyActive);
+ VPDataUpdate->setVPNewMaskAndVPAnyActive(VPMaskSel, VPAnyActive);
+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
+ new VPCSAExtractScalarRecipe({VPInitScalar, VPMaskSel, VPDataUpdate});
+
+ MiddleVPBB->insert(ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi());
+
+ // Update CSAState with new recipes
+ CSAState->setExtractScalarRecipe(ExtractScalarRecipe);
+ CSAState->setVPAnyActive(VPAnyActive);
+ }
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8635,7 +8787,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
// increments.
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
auto *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
// No edge from the middle block to the unique exit block has been inserted
@@ -8669,6 +8822,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
return P && Inductions.contains(P);
})))
continue;
+ // Exit values for CSAs are computed and updated outside of VPlan and
+ // independent of induction recipes.
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
+ // live-outs.
+ if (isa<VPCSADataUpdateRecipe>(V) &&
+ (isa<Instruction>(IncomingValue) &&
+ any_of(IncomingValue->users(), [&CSAs](User *U) {
+ auto *P = dyn_cast<PHINode>(U);
+ return P && CSAs.contains(P);
+ })))
+ continue;
ExitUsersToFix.insert(ExitIRI);
ExitIRI->addOperand(V);
}
@@ -8883,6 +9047,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
bool HasNUW = Style == TailFoldingStyle::None;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+ addCSAPreprocessRecipes(Legal->getCSAs(), OrigLoop, Plan->getPreheader(),
+ Plan->getVectorLoopRegion()->getEntryBasicBlock(), DL,
+ Range, *Plan);
+
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
// ---------------------------------------------------------------------------
@@ -8989,6 +9157,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
+ addCSAPostprocessRecipes(RecipeBuilder, Legal->getCSAs(), MiddleVPBB, DL,
+ Range, *Plan);
+
// After here, VPBB should not be used.
VPBB = nullptr;
@@ -8998,8 +9171,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
+ SetVector<VPIRInstruction *> ExitUsersToFix =
+ collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
+ Legal->getInductionVars(), Legal->getCSAs());
addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addUsersInExitBlock(*Plan, ExitUsersToFix);
@@ -10097,6 +10271,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
+ CSAsVectorized += LVL.getCSAs().size();
// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
@@ -10191,6 +10366,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
PSI, Checks);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
+ CSAsVectorized += LVL.getCSAs().size();
// Add metadata to disable runtime unrolling a scalar loop when there
// are no runtime checks about strides and memory. A scalar loop that is
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 41e281f3fa9973..462d29b0ca8136 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -217,7 +217,7 @@ void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
- while (It != end() && It->isPhi())
+ while (It != end() && vputils::isPhi(*It))
It++;
return It;
}
@@ -859,6 +859,9 @@ VPlan::~VPlan() {
delete VPV;
if (BackedgeTakenCount)
delete BackedgeTakenCount;
+
+ for (std::pair<PHINode *, VPCSAState *> &S : CSAStates)
+ delete S.second;
}
static VPIRBasicBlock *createVPIRBasicBlockFor(BasicBlock *BB) {
@@ -1053,7 +1056,7 @@ void VPlan::execute(VPTransformState *State) {
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
// Skip phi-like recipes that generate their backedege values themselves.
- if (isa<VPWidenPHIRecipe>(&R))
+ if (vputils::isPhiThatGeneratesBackedge(R))
continue;
if (isa<VPWidenPointerInductionRecipe>(&R) ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index eac4fe8ce8b0f2..eb65ec94e8bed3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -250,6 +250,53 @@ struct VPIteration {
bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
};
+class VPInstruction;
+class VPCSAHeaderPHIRecipe;
+class VPCSADataUpdateRecipe;
+class VPCSAExtractScalarRecipe;
+
+/// VPCSAState holds information required to vectorize a conditional scalar
+/// assignment (CSA). It only stores pointers to the VPlan values and recipes
+/// that were created for one CSA; it has no destructor and deletes none of
+/// them.
+class VPCSAState {
+  /// The scalar the CSA starts from.
+  VPValue *VPInitScalar = nullptr;
+  /// Instruction producing the initial data vector.
+  VPInstruction *VPInitData = nullptr;
+  /// Instruction modelling the mask phi.
+  VPInstruction *VPMaskPhi = nullptr;
+  /// Instruction computing whether any lane is active; set after
+  /// construction via setVPAnyActive().
+  VPInstruction *VPAnyActive = nullptr;
+  /// Header phi recipe for the data vector; set via setPhiRecipe().
+  VPCSAHeaderPHIRecipe *VPPhiRecipe = nullptr;
+  /// Recipe updating the data vector; set via setDataUpdate().
+  VPCSADataUpdateRecipe *VPDataUpdate = nullptr;
+  /// Recipe extracting the final scalar; set via setExtractScalarRecipe().
+  VPCSAExtractScalarRecipe *VPExtractScalar = nullptr;
+
+public:
+  VPCSAState(VPValue *VPInitScalar, VPInstruction *InitData,
+             VPInstruction *MaskPhi)
+      : VPInitScalar(VPInitScalar), VPInitData(InitData), VPMaskPhi(MaskPhi) {}
+
+  /// Construct a state that only knows the initial scalar; every other member
+  /// stays nullptr until the matching setter is called.
+  VPCSAState(VPValue *VPInitScalar) : VPInitScalar(VPInitScalar) {}
+
+  VPValue *getVPInitScalar() const { return VPInitScalar; }
+
+  VPInstruction *getVPInitData() const { return VPInitData; }
+
+  VPInstruction *getVPMaskPhi() const { return VPMaskPhi; }
+
+  void setVPAnyActive(VPInstruction *AnyActive) { VPAnyActive = AnyActive; }
+  // Const like every other getter of this class, so it can also be called
+  // through a const VPCSAState.
+  VPInstruction *getVPAnyActive() const { return VPAnyActive; }
+
+  VPCSAHeaderPHIRecipe *getPhiRecipe() const { return VPPhiRecipe; }
+
+  void setPhiRecipe(VPCSAHeaderPHIRecipe *R) { VPPhiRecipe = R; }
+
+  VPCSADataUpdateRecipe *getDataUpdate() const { return VPDataUpdate; }
+  void setDataUpdate(VPCSADataUpdateRecipe *R) { VPDataUpdate = R; }
+
+  void setExtractScalarRecipe(VPCSAExtractScalarRecipe *R) {
+    VPExtractScalar = R;
+  }
+  VPCSAExtractScalarRecipe *getExtractScalarRecipe() const {
+    return VPExtractScalar;
+  }
+};
+
/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
@@ -935,6 +982,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
case VPRecipeBase::VPScalarCastSC:
+ case VPRecipeBase::VPCSAHeaderPHISC:
+ case VPRecipeBase::VPCSADataUpdateSC:
+ case VPRecipeBase::VPCSAExtractScalarSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveSC:
@@ -1269,6 +1319,14 @@ class VPInstruction : public VPRecipeWithIRFlags {
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
+ CSAInitMask,
+ CSAInitData,
+ CSAMaskPhi,
+ CSAMaskSel,
+ CSAVLPhi,
+ CSAVLSel,
+ CSAAnyActive,
+ CSAAnyActiveEVL,
};
private:
@@ -2538,6 +2596,110 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
}
};
+/// Header phi recipe for the data vector of a conditional scalar assignment.
+/// Operand 0 is the initial data value and operand 1 is the new data value
+/// (see getVPInitData() / getVPNewData()).
+class VPCSAHeaderPHIRecipe final : public VPHeaderPHIRecipe {
+public:
+ /// Forwards \p Phi and \p VPInitData to the VPHeaderPHIRecipe constructor.
+ VPCSAHeaderPHIRecipe(PHINode *Phi, VPValue *VPInitData)
+ : VPHeaderPHIRecipe(VPDef::VPCSAHeaderPHISC, Phi, VPInitData) {}
+
+ ~VPCSAHeaderPHIRecipe() override = default;
+
+ /// Creates a new recipe for the same underlying PHINode, passing operand 0
+ /// as the initial data value.
+ VPCSAHeaderPHIRecipe *clone() override {
+ return new VPCSAHeaderPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+ getOperand(0));
+ }
+
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ VP_CLASSOF_IMPL(VPDef::VPCSAHeaderPHISC)
+
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPDef::VPCSAHeaderPHISC;
+ }
+
+ /// Operand 0: the initial data value.
+ VPValue *getVPInitData() { return getOperand(0); }
+ /// Operand 1: the new data value. NOTE(review): the constructor only passes
+ /// one operand, so this presumably is only valid once a second operand has
+ /// been added elsewhere -- confirm against the caller.
+ VPValue *getVPNewData() { return getOperand(1); }
+};
+
+/// Recipe that updates the data vector of a conditional scalar assignment.
+/// Operand layout, as read by the getters below: 0 = data phi, 1 = select
+/// condition, 2 = select true value, 3 = select false value, 4 = new mask,
+/// 5 = any-active. Operands 4 and 5 are appended after construction by
+/// setVPNewMaskAndVPAnyActive().
+class VPCSADataUpdateRecipe final : public VPSingleDefRecipe {
+public:
+ VPCSADataUpdateRecipe(SelectInst *SI, ArrayRef<VPValue *> Operands)
+ : VPSingleDefRecipe(VPDef::VPCSADataUpdateSC, Operands, SI) {}
+
+ ~VPCSADataUpdateRecipe() override = default;
+
+ /// Creates a new recipe with the same underlying SelectInst and a copy of
+ /// the current operand list.
+ VPCSADataUpdateRecipe *clone() override {
+ SmallVector<VPValue *> Ops(operands());
+ return new VPCSADataUpdateRecipe(cast<SelectInst>(getUnderlyingInstr()),
+ Ops);
+ }
+
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ VP_CLASSOF_IMPL(VPDef::VPCSADataUpdateSC)
+
+ // The data phi this recipe updates
+ VPValue *getVPDataPhi() const { return getOperand(0); }
+
+ // The condition from the original select statement
+ VPValue *getVPCond() const { return getOperand(1); }
+
+ // The true value from the original select statement
+ VPValue *getVPTrue() const { return getOperand(2); }
+
+ // The false value from the original select statement
+ VPValue *getVPFalse() const { return getOperand(3); }
+
+ // We combine the setters so we can be sure NewMask is before AnyActive
+ // in the operands list, so the getters can be sure which operand numbers
+ // to get.
+ void setVPNewMaskAndVPAnyActive(VPValue *NewMask, VPValue *AnyActive) {
+ addOperand(NewMask);
+ addOperand(AnyActive);
+ }
+
+ // Only meaningful after setVPNewMaskAndVPAnyActive() has been called.
+ VPValue *getVPNewMask() const { return getOperand(4); }
+
+ // Only meaningful after setVPNewMaskAndVPAnyActive() has been called.
+ VPValue *getVPAnyActive() const { return getOperand(5); }
+};
+
+/// Recipe that produces the final scalar of a conditional scalar assignment.
+/// Operands are, in order: the initial scalar, the mask select, the data
+/// select and -- only when EVL is in use -- the CSAVLSel value.
+class VPCSAExtractScalarRecipe final : public VPSingleDefRecipe {
+public:
+  VPCSAExtractScalarRecipe(ArrayRef<VPValue *> Operands)
+      : VPSingleDefRecipe(VPDef::VPCSAExtractScalarSC, Operands) {}
+
+  ~VPCSAExtractScalarRecipe() override = default;
+
+  /// Creates a new recipe with a copy of the current operand list.
+  VPCSAExtractScalarRecipe *clone() override {
+    SmallVector<VPValue *> Ops(operands());
+    return new VPCSAExtractScalarRecipe(Ops);
+  }
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  // Declared like the other CSA recipes so that isa/cast/dyn_cast match on
+  // this recipe's own VPDef ID.
+  VP_CLASSOF_IMPL(VPDef::VPCSAExtractScalarSC)
+
+  VPValue *getVPInitScalar() const { return getOperand(0); }
+  VPValue *getVPMaskSel() const { return getOperand(1); }
+  VPValue *getVPDataSel() const { return getOperand(2); }
+  /// Only valid when usesEVL() returns true.
+  VPValue *getVPCSAVLSel() const { return getOperand(3); }
+  /// True once the fourth (CSAVLSel) operand has been added.
+  bool usesEVL() const { return getNumOperands() == 4; }
+};
+
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
/// control converges back from a Branch-on-Mask. The phi nodes are needed in
/// order to merge values that are set under such a branch and feed their uses.
@@ -3440,6 +3602,8 @@ class VPlan {
/// live-outs are fixed via VPLiveOut::fixPhi.
MapVector<PHINode *, VPLiveOut *> LiveOuts;
+ MapVector<PHINode *, VPCSAState *> CSAStates;
+
/// Mapping from SCEVs to the VPValues representing their expansions.
/// NOTE: This mapping is temporary and will be removed once all users have
/// been modeled in VPlan directly.
@@ -3482,6 +3646,12 @@ class VPlan {
bool RequiresScalarEpilogueCheck,
bool TailFolded, Loop *TheLoop);
+ void addCSAState(PHINode *Phi, VPCSAState *S) { CSAStates.insert({Phi, S}); }
+
+ MapVector<PHINode *, VPCSAState *> const &getCSAStates() const {
+ return CSAStates;
+ }
+
/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
Value *CanonicalIVStartValue, VPTransformState &State);
@@ -3925,6 +4095,20 @@ class VPlanSlp {
/// Return true if all visited instruction can be combined.
bool isCompletelySLP() const { return CompletelySLP; }
};
+
+namespace vputils {
+
+/// Returns true for PHI-like recipes.
+bool isPhi(const VPRecipeBase &R);
+
+/// Returns true for PHI-like recipes that generate their own backedge values.
+bool isPhiThatGeneratesBackedge(const VPRecipeBase &R);
+
+/// Returns true for PHI-like recipes that exist in the vector loop header
+/// basic block.
+bool isHeaderPhi(const VPRecipeBase &R);
+} // end namespace vputils
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9068ccf519c55c..37040fbf76f78c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -368,6 +368,10 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::CSAVLSel:
+ case VPInstruction::CSAVLPhi:
+ case VPInstruction::CSAAnyActive:
+ case VPInstruction::CSAAnyActiveEVL:
return true;
default:
return false;
@@ -678,6 +682,94 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
}
return NewPhi;
}
+ case VPInstruction::CSAInitMask: {
+ if (Part == 0) {
+ Value *InitMask = ConstantAggregateZero::get(VectorType::get(
+ Type::getInt1Ty(State.Builder.getContext()), State.VF));
+ State.set(this, InitMask, Part);
+ return InitMask;
+ }
+ Value *V = State.get(this, Part - 1);
+ return V;
+ }
+ case VPInstruction::CSAInitData: {
+ if (Part == 0) {
+ Type *ElemTyp = getOperand(0)->getUnderlyingValue()->getType();
+ Value *InitData = PoisonValue::get(VectorType::get(ElemTyp, State.VF));
+ State.set(this, InitData, Part);
+ return InitData;
+ }
+ Value *V = State.get(this, Part - 1);
+ return V;
+ }
+ case VPInstruction::CSAMaskPhi: {
+ if (Part == 0) {
+ IRBuilder<>::InsertPointGuard Guard(State.Builder);
+ State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+ BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+ Value *InitMask = State.get(getOperand(0), Part);
+ PHINode *MaskPhi =
+ State.Builder.CreatePHI(InitMask->getType(), 2, "csa.mask.phi");
+ MaskPhi->addIncoming(InitMask, PreheaderBB);
+ State.set(this, MaskPhi, Part);
+ return MaskPhi;
+ }
+ Value *V = State.get(this, Part - 1);
+ return V;
+ }
+ case VPInstruction::CSAMaskSel: {
+ Value *WidenedCond = State.get(getOperand(0), Part);
+ Value *MaskPhi = State.get(getOperand(1), Part);
+ Value *AnyActive = State.get(getOperand(2), Part, /*NeedsScalar=*/true);
+ // If not the first Part, use the mask from the previous unrolled Part
+ Value *OldMask = Part == 0 ? MaskPhi : State.get(this, Part - 1);
+ Value *MaskSel = State.Builder.CreateSelect(AnyActive, WidenedCond, OldMask,
+ "csa.mask.sel");
+ // MaskPhi wants to use the most recently updated mask. That's the one
+ // that corresponds to the last Part.
+ if (Part == State.UF - 1)
+ cast<PHINode>(MaskPhi)->addIncoming(MaskSel, State.CFG.PrevBB);
+ return MaskSel;
+ }
+ case VPInstruction::CSAAnyActive: {
+ Value *WidenedCond = State.get(getOperand(0), Part);
+ return Builder.CreateOrReduce(WidenedCond);
+ }
+ case VPInstruction::CSAAnyActiveEVL: {
+ Value *WidenedCond = State.get(getOperand(0), Part);
+ Value *AllOnesMask = Constant::getAllOnesValue(
+ VectorType::get(Type::getInt1Ty(State.Builder.getContext()), State.VF));
+ Value *EVL = State.get(getOperand(1), Part, /*NeedsScalar=*/true);
+
+ Value *StartValue =
+ ConstantInt::get(WidenedCond->getType()->getScalarType(), 0);
+ Value *AnyActive = State.Builder.CreateIntrinsic(
+ WidenedCond->getType()->getScalarType(), Intrinsic::vp_reduce_or,
+ {StartValue, WidenedCond, AllOnesMask, EVL}, nullptr,
+ "csa.cond.anyactive");
+ return AnyActive;
+ }
+ case VPInstruction::CSAVLPhi: {
+ IRBuilder<>::InsertPointGuard Guard(State.Builder);
+ State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+ BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+
+ // InitVL can be anything since it won't be used if no mask was active
+ Value *InitVL = ConstantInt::get(State.Builder.getInt32Ty(), 0);
+ PHINode *VLPhi =
+ State.Builder.CreatePHI(InitVL->getType(), 2, "csa.vl.phi");
+ VLPhi->addIncoming(InitVL, PreheaderBB);
+ return VLPhi;
+ }
+ case VPInstruction::CSAVLSel: {
+ Value *AnyActive = State.get(getOperand(0), Part, /*NeedsScalar=*/true);
+ Value *VLPhi = State.get(getOperand(1), Part, /*NeedsScalar=*/true);
+ Value *EVL = State.get(getOperand(2), Part, /*NeedsScalar=*/true);
+ Value *VLSel =
+ State.Builder.CreateSelect(AnyActive, EVL, VLPhi, "csa.vl.sel");
+ cast<PHINode>(VLPhi)->addIncoming(VLSel, State.CFG.PrevBB);
+ return VLSel;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
@@ -686,11 +778,16 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
- getOpcode() == VPInstruction::ComputeReductionResult;
+ getOpcode() == VPInstruction::ComputeReductionResult ||
+ getOpcode() == VPInstruction::CSAAnyActive ||
+ getOpcode() == VPInstruction::CSAAnyActiveEVL;
}
bool VPInstruction::isSingleScalar() const {
- return getOpcode() == VPInstruction::ResumePhi;
+ return getOpcode() == VPInstruction::ResumePhi ||
+ getOpcode() == VPInstruction::CSAVLPhi ||
+ getOpcode() == VPInstruction::CSAVLSel ||
+ getOpcode() == VPInstruction::ExplicitVectorLength;
}
#if !defined(NDEBUG)
@@ -853,6 +950,30 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
+ case VPInstruction::CSAInitMask:
+ O << "csa-init-mask";
+ break;
+ case VPInstruction::CSAInitData:
+ O << "csa-init-data";
+ break;
+ case VPInstruction::CSAMaskPhi:
+ O << "csa-mask-phi";
+ break;
+ case VPInstruction::CSAMaskSel:
+ O << "csa-mask-sel";
+ break;
+ case VPInstruction::CSAVLPhi:
+ O << "csa-vl-phi";
+ break;
+ case VPInstruction::CSAVLSel:
+ O << "csa-vl-sel";
+ break;
+ case VPInstruction::CSAAnyActive:
+ O << "csa-anyactive";
+ break;
+ case VPInstruction::CSAAnyActiveEVL:
+ O << "csa-anyactive-evl";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -2139,6 +2260,123 @@ void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent,
}
#endif
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Prints the recipe as "<Indent>EMIT <result> = csa-data-phi <operands>".
+void VPCSAHeaderPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = csa-data-phi ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+// Creates the "csa.data.phi" IR phi at the first non-phi position of
+// State.CFG.PrevBB, adds only the preheader incoming value (the initial data
+// of part 0), and registers that single phi for every unrolled part.
+void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
+ // PrevBB is this BB
+ // The guard restores the builder's insert point when this function returns.
+ IRBuilder<>::InsertPointGuard Guard(State.Builder);
+ State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+
+ Value *InitData = State.get(getVPInitData(), 0);
+ PHINode *DataPhi =
+ State.Builder.CreatePHI(InitData->getType(), 2, "csa.data.phi");
+ BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+ DataPhi->addIncoming(InitData, PreheaderBB);
+ // Note: We didn't add Incoming for the new data since VPCSADataUpdateRecipe
+ // may not have been executed. We let VPCSADataUpdateRecipe::execute add the
+ // incoming operand to DataPhi.
+
+ // Use the same DataPhi for all Parts
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.set(this, DataPhi, Part);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Prints the recipe as "<Indent>EMIT <result> = csa-data-update <operands>".
+void VPCSADataUpdateRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = csa-data-update ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+// For every unrolled part, emits a "csa.data.sel" select that takes the newly
+// computed data when the any-active value is true and keeps the old data
+// otherwise. The select of the last part becomes the backedge incoming value
+// of the data phi.
+void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value *AnyActive = State.get(getVPAnyActive(), Part, /*NeedsScalar=*/true);
+ // The updated data is whichever select operand is not the data phi: the
+ // false value if the phi is the true operand, the true value otherwise.
+ Value *DataUpdate = getVPDataPhi() == getVPTrue()
+ ? State.get(getVPFalse(), Part)
+ : State.get(getVPTrue(), Part);
+ PHINode *DataPhi = cast<PHINode>(State.get(getVPDataPhi(), Part));
+ // If not the first Part, use the data from the previous unrolled Part
+ Value *OldData = Part == 0 ? DataPhi : State.get(this, Part - 1);
+ Value *DataSel = State.Builder.CreateSelect(AnyActive, DataUpdate, OldData,
+ "csa.data.sel");
+
+ // DataPhi wants the most recently updated data, i.e. the one of the last
+ // Part.
+ if (Part == State.UF - 1)
+ DataPhi->addIncoming(DataSel, State.CFG.PrevBB);
+ State.set(this, DataSel, Part);
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Prints the recipe as "<Indent>EMIT <result> = CSA-EXTRACT-SCALAR <operands>".
+void VPCSAExtractScalarRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = CSA-EXTRACT-SCALAR ";
+ printOperands(O, SlotTracker);
+}
+#endif
+
+// Emits, at the first non-phi position of State.CFG.ExitBB, the code that
+// picks the final scalar of the CSA: the element of the data vector at the
+// last active lane of the mask, or the initial scalar if no lane is active.
+// Only the values of the last unrolled part are used.
+void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
+  IRBuilder<>::InsertPointGuard Guard(State.Builder);
+  State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
+
+  unsigned LastPart = State.UF - 1;
+  Value *InitScalar = getVPInitScalar()->getLiveInIRValue();
+  Value *MaskSel = State.get(getVPMaskSel(), LastPart);
+  Value *DataSel = State.get(getVPDataSel(), LastPart);
+
+  Value *LastIdx = nullptr;
+  Value *IndexVec = State.Builder.CreateStepVector(
+      VectorType::get(State.Builder.getInt32Ty(), State.VF), "csa.step");
+  Value *NegOne = ConstantInt::get(IndexVec->getType()->getScalarType(), -1);
+  if (usesEVL()) {
+    // A vp.reduce.smax over the IndexVec with the MaskSel as the mask will
+    // give us the last active index into MaskSel, which gives us the correct
+    // index in the data vector to extract from. If no element in the mask
+    // is active, we pick -1. If we pick -1, then we will use the initial scalar
+    // value instead of extracting from the data vector.
+    Value *VL = State.get(getVPCSAVLSel(), LastPart, /*NeedsScalar=*/true);
+    LastIdx = State.Builder.CreateIntrinsic(NegOne->getType(),
+                                            Intrinsic::vp_reduce_smax,
+                                            {NegOne, IndexVec, MaskSel, VL});
+  } else {
+    // Get a vector where the elements are zero when the last active mask is
+    // false and the index in the vector when the mask is true.
+    Value *ActiveLaneIdxs = State.Builder.CreateSelect(
+        MaskSel, IndexVec, ConstantAggregateZero::get(IndexVec->getType()));
+    // Get the last active index in the mask. The reduction is 0 both when
+    // lane 0 is the last active lane and when no lane is active at all, so a
+    // result of 0 is ambiguous. Disambiguate by checking lane 0 of the mask:
+    // only when the reduction is 0 AND lane 0 is inactive is no lane active,
+    // in which case LastIdx becomes -1. In every other case the reduction
+    // result already is the last active index and must be kept as is.
+    Value *MaybeLastIdx = State.Builder.CreateIntMaxReduce(ActiveLaneIdxs);
+    Value *IsLaneZeroActive =
+        State.Builder.CreateExtractElement(MaskSel, (uint64_t)0);
+    Value *Zero = ConstantInt::get(MaybeLastIdx->getType(), 0);
+    Value *MaybeLastIdxEQZero = State.Builder.CreateICmpEQ(MaybeLastIdx, Zero);
+    Value *NoLaneActive = State.Builder.CreateAnd(
+        MaybeLastIdxEQZero, State.Builder.CreateNot(IsLaneZeroActive));
+    LastIdx = State.Builder.CreateSelect(NoLaneActive, NegOne, MaybeLastIdx);
+  }
+
+  // When LastIdx is -1 the extract below is not a valid lane, but its result
+  // is never chosen: the select falls back to the initial scalar.
+  Value *ExtractFromVec =
+      State.Builder.CreateExtractElement(DataSel, LastIdx, "csa.extract");
+  Value *Zero = ConstantInt::get(LastIdx->getType(), 0);
+  Value *LastIdxGEZero = State.Builder.CreateICmpSGE(LastIdx, Zero);
+  Value *ChooseFromVecOrInit =
+      State.Builder.CreateSelect(LastIdxGEZero, ExtractFromVec, InitScalar);
+  State.set(this, ChooseFromVecOrInit, 0, /*IsScalar=*/true);
+}
+
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Branch on Mask works only on single instance.");
@@ -3202,3 +3440,30 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printOperands(O, SlotTracker);
}
#endif
+
+// Returns true if R reports itself as a phi, or if it is a VPInstruction with
+// one of the CSA phi opcodes (CSAMaskPhi, CSAVLPhi).
+bool vputils::isPhi(const VPRecipeBase &R) {
+  if (R.isPhi())
+    return true;
+
+  const auto *CSAInst = dyn_cast<VPInstruction>(&R);
+  if (!CSAInst)
+    return false;
+
+  switch (CSAInst->getOpcode()) {
+  case VPInstruction::CSAMaskPhi:
+  case VPInstruction::CSAVLPhi:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true for VPWidenPHIRecipe, VPCSAHeaderPHIRecipe, and VPInstructions
+// with one of the CSA phi opcodes (CSAMaskPhi, CSAVLPhi).
+bool vputils::isPhiThatGeneratesBackedge(const VPRecipeBase &R) {
+  if (isa<VPWidenPHIRecipe, VPCSAHeaderPHIRecipe>(&R))
+    return true;
+
+  const auto *CSAInst = dyn_cast<VPInstruction>(&R);
+  if (!CSAInst)
+    return false;
+
+  switch (CSAInst->getOpcode()) {
+  case VPInstruction::CSAMaskPhi:
+  case VPInstruction::CSAVLPhi:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true for VPHeaderPHIRecipe, VPWidenPHIRecipe, and VPInstructions
+// with one of the CSA phi opcodes (CSAMaskPhi, CSAVLPhi).
+bool vputils::isHeaderPhi(const VPRecipeBase &R) {
+  if (isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(&R))
+    return true;
+
+  const auto *CSAInst = dyn_cast<VPInstruction>(&R);
+  if (!CSAInst)
+    return false;
+
+  switch (CSAInst->getOpcode()) {
+  case VPInstruction::CSAMaskPhi:
+  case VPInstruction::CSAVLPhi:
+    return true;
+  default:
+    return false;
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1d84550010017f..3f4c22e54d8d16 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1313,6 +1313,45 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}
+/// Add the recipes required to make CSA work with the EVL-based approach. For
+/// every entry of \p CSAStates this replaces its CSAAnyActive instruction with
+/// a CSAAnyActiveEVL instruction that also takes \p EVL, adds CSAVLPhi and
+/// CSAVLSel instructions, and appends the CSAVLSel as an extra operand of the
+/// state's extract-scalar recipe.
+static void addExplicitVectorLengthForCSA(
+ VPValue &EVL, const MapVector<PHINode *, VPCSAState *> &CSAStates) {
+ for (auto &[_, CSAState] : CSAStates) {
+ // CSAAnyActive is used to keep track of whether any condition on the
+ // current iteration is active. This is used to decide whether the mask
+ // should be updated. When we are using EVL, we must only consider the first
+ // EVL number of elements in the mask. Replace CSAAnyActive with the
+ // EVL-specific CSAAnyActiveEVL instruction.
+ auto *VPAnyActive = CSAState->getVPAnyActive();
+ auto *VPAnyActiveEVL = new VPInstruction(
+ VPInstruction::CSAAnyActiveEVL, {VPAnyActive->getOperand(0), &EVL},
+ VPAnyActive->getDebugLoc(), "csa.cond.anyactive");
+ VPAnyActiveEVL->insertBefore(VPAnyActive);
+ VPAnyActive->replaceAllUsesWith(VPAnyActiveEVL->getVPSingleValue());
+ VPAnyActive->eraseFromParent();
+ CSAState->setVPAnyActive(VPAnyActiveEVL);
+
+ // When we are using EVL, we must keep track of the most recent EVL for
+ // which at least one lane in the mask was active. Imagine the scenario: on
+ // iteration N, there was at least one active lane in the mask. Then on all
+ // future iterations there were no active lanes in the mask. When it is time
+ // to extract the scalar from the data vector, we must use the EVL that was
+ // in effect when the mask vector was last updated. To do this, we introduce
+ // the CSAVLPhi and CSAVLSel instructions.
+ auto *VPVLPhi =
+ new VPInstruction(VPInstruction::CSAVLPhi, {}, {}, "csa.vl.phi");
+ auto *VPVLSel =
+ new VPInstruction(VPInstruction::CSAVLSel,
+ {VPAnyActiveEVL, VPVLPhi, &EVL}, {}, "csa.vl.sel");
+ VPVLPhi->insertAfter(CSAState->getPhiRecipe());
+ VPVLSel->insertAfter(VPAnyActiveEVL);
+
+ // The extra operand is what makes the extract-scalar recipe take its EVL
+ // path (it checks for a fourth operand).
+ CSAState->getExtractScalarRecipe()->addOperand(VPVLSel);
+ }
+}
+
/// Replace recipes with their EVL variants.
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
@@ -1368,6 +1407,11 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
}
recursivelyDeleteDeadRecipes(HeaderMask);
}
+
+ // We build the scalar version of a CSA when VF=ElementCount::getFixed(1),
+ // which does not require an EVL.
+ if (!Plan.hasScalarVFOnly())
+ addExplicitVectorLengthForCSA(EVL, Plan.getCSAStates());
}
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 1dd8d09ff62472..e15199e4946fa5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -360,6 +360,8 @@ class VPDef {
VPWidenEVLSC,
VPWidenSelectSC,
VPBlendSC,
+ VPCSADataUpdateSC,
+ VPCSAExtractScalarSC,
// START: Phi-like recipes. Need to be kept together.
VPWidenPHISC,
VPPredInstPHISC,
@@ -371,6 +373,7 @@ class VPDef {
VPFirstOrderRecurrencePHISC,
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
+ VPCSAHeaderPHISC,
VPReductionPHISC,
// END: SubclassID for recipes that inherit VPHeaderPHIRecipe
// END: Phi-like recipes
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 99bc4c38a3c3cd..b4bc3de463de11 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -73,11 +73,11 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
const VPRegionBlock *ParentR = VPBB->getParent();
bool IsHeaderVPBB = ParentR && !ParentR->isReplicator() &&
ParentR->getEntryBasicBlock() == VPBB;
- while (RecipeI != End && RecipeI->isPhi()) {
+ while (RecipeI != End && vputils::isPhi(*RecipeI)) {
if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI))
NumActiveLaneMaskPhiRecipes++;
- if (IsHeaderVPBB && !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(*RecipeI)) {
+ if (IsHeaderVPBB && !vputils::isHeaderPhi(*RecipeI)) {
errs() << "Found non-header PHI recipe in header VPBB";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
errs() << ": ";
@@ -104,7 +104,7 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
}
while (RecipeI != End) {
- if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) {
+ if (vputils::isPhi(*RecipeI) && !isa<VPBlendRecipe>(&*RecipeI)) {
errs() << "Found phi-like recipe after non-phi recipe";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
index 71a5519522a275..89a2dbadd3a2ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
; RUN: -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
-; RUN: | FileCheck %s -check-prefix=EVL
+; RUN: -enable-csa-vectorization | FileCheck %s -check-prefix=EVL
; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
; RUN: -passes=loop-vectorize -force-tail-folding-style=none \
-; RUN: | FileCheck %s -check-prefix=NO-EVL
+; RUN: -enable-csa-vectorization | FileCheck %s -check-prefix=NO-EVL
; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
; RUN: -passes=loop-vectorize -force-tail-folding-style=data \
-; RUN: | FileCheck %s -check-prefix=DATA
+; RUN: -enable-csa-vectorization | FileCheck %s -check-prefix=DATA
; This function is generated from the following C/C++ program:
; int simple_csa_int_select(int N, int *data, int a) {
@@ -25,24 +25,69 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; EVL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; EVL-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
+; EVL-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; EVL-NEXT: ret i32 [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_int_select(
; NO-EVL-NEXT: entry:
@@ -50,24 +95,69 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; NO-EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; NO-EVL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; NO-EVL-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; NO-EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
; DATA-LABEL: @simple_csa_int_select(
; DATA-NEXT: entry:
@@ -75,24 +165,69 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; DATA: for.body.preheader:
; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
+; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; DATA-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; DATA-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; DATA-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; DATA-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; DATA-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
+; DATA-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; DATA-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; DATA-NEXT: ret i32 [[T_0_LCSSA]]
; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP1]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
@@ -139,24 +274,78 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; EVL-NEXT: ret i32 [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
; NO-EVL-NEXT: entry:
@@ -164,24 +353,78 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; NO-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; NO-EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
; DATA-LABEL: @simple_csa_int_select_induction_cmp(
; DATA-NEXT: entry:
@@ -189,24 +432,78 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; DATA: for.body.preheader:
; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; DATA-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
+; DATA-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; DATA-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; DATA-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; DATA-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; DATA-NEXT: ret i32 [[T_0_LCSSA]]
; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_010]]
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
@@ -253,23 +550,65 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
; EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; EVL-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; EVL-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; EVL-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
+; EVL-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; EVL-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
+; EVL-NEXT: [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
+; EVL-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; EVL-NEXT: ret float [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
+; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_float_select(
; NO-EVL-NEXT: entry:
@@ -277,23 +616,65 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
; NO-EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; NO-EVL-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
+; NO-EVL-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
+; NO-EVL-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
+; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
; DATA-LABEL: @simple_csa_float_select(
; DATA-NEXT: entry:
@@ -301,23 +682,65 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
; DATA-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; DATA: for.body.preheader:
; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
+; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; DATA-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; DATA-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; DATA-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
+; DATA-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; DATA-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
+; DATA-NEXT: [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
+; DATA-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; DATA-NEXT: ret float [[T_0_LCSSA]]
; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP0]], float [[T_09]]
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
+; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
entry:
%cmp8 = icmp sgt i32 %N, 0
@@ -626,32 +1049,97 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; EVL-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; EVL-NEXT: [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; EVL-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
+; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; EVL-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; EVL-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
+; EVL-NEXT: [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
+; EVL-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
+; EVL-NEXT: [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
+; EVL-NEXT: [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
+; EVL-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; EVL-NEXT: ret i32 [[OR]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
-; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
; NO-EVL-LABEL: @csa_in_series_int_select(
; NO-EVL-NEXT: entry:
@@ -659,32 +1147,97 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; NO-EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; NO-EVL-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; NO-EVL-NEXT: [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; NO-EVL-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; NO-EVL-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; NO-EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
+; NO-EVL-NEXT: [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
+; NO-EVL-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
+; NO-EVL-NEXT: [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; NO-EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
+; NO-EVL-NEXT: [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
+; NO-EVL-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; NO-EVL-NEXT: ret i32 [[OR]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
-; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
; DATA-LABEL: @csa_in_series_int_select(
; DATA-NEXT: entry:
@@ -692,32 +1245,97 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; DATA: for.body.preheader:
; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
+; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; DATA-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; DATA-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
+; DATA-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; DATA-NEXT: [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
+; DATA-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
+; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
+; DATA-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; DATA-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; DATA-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
+; DATA-NEXT: [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
+; DATA-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
+; DATA-NEXT: [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; DATA-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
+; DATA-NEXT: [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
+; DATA-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; DATA-NEXT: ret i32 [[OR]]
; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP4]]
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
entry:
%cmp21 = icmp sgt i32 %N, 0
@@ -773,32 +1391,106 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; EVL-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; EVL-NEXT: [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
+; EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
+; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; EVL-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; EVL-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; EVL-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
+; EVL-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
+; EVL-NEXT: [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
+; EVL-NEXT: [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
+; EVL-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
+; EVL-NEXT: [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
+; EVL-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; EVL-NEXT: ret i32 [[OR]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
-; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
;
; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
; NO-EVL-NEXT: entry:
@@ -806,32 +1498,106 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; NO-EVL-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; NO-EVL-NEXT: [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
+; NO-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; NO-EVL-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; NO-EVL-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; NO-EVL-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
+; NO-EVL-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; NO-EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
+; NO-EVL-NEXT: [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
+; NO-EVL-NEXT: [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
+; NO-EVL-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
+; NO-EVL-NEXT: [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
+; NO-EVL-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; NO-EVL-NEXT: ret i32 [[OR]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
-; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
;
; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
; DATA-NEXT: entry:
@@ -839,32 +1605,106 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; DATA: for.body.preheader:
; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; DATA-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
+; DATA-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; DATA-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
+; DATA-NEXT: [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
+; DATA-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
+; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; DATA-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; DATA-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; DATA-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
+; DATA-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; DATA-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
+; DATA-NEXT: [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
+; DATA-NEXT: [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
+; DATA-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
+; DATA-NEXT: [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
+; DATA-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; DATA-NEXT: ret i32 [[OR]]
; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[T_022]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP4]]
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP3]], i32 [[S_023]]
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
+; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
;
entry:
%cmp21 = icmp sgt i32 %N, 0
@@ -921,30 +1761,91 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
; EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; EVL-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; EVL-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
+; EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; EVL-NEXT: [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
+; EVL-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
+; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; EVL-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; EVL-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
+; EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; EVL-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; EVL-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; EVL-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
+; EVL-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
; EVL-NEXT: ret float [[ADD]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
-; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
+; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
;
; NO-EVL-LABEL: @csa_in_series_float_select(
; NO-EVL-NEXT: entry:
@@ -952,30 +1853,91 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
; NO-EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; NO-EVL-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; NO-EVL-NEXT: [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
+; NO-EVL-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; NO-EVL-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; NO-EVL-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; NO-EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
+; NO-EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; NO-EVL-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; NO-EVL-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; NO-EVL-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
+; NO-EVL-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; NO-EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
; NO-EVL-NEXT: ret float [[ADD]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
-; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
+; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
;
; DATA-LABEL: @csa_in_series_float_select(
; DATA-NEXT: entry:
@@ -983,30 +1945,91 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
; DATA-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; DATA: for.body.preheader:
; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
+; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
+; DATA-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
+; DATA-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
+; DATA-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
+; DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
+; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; DATA-NEXT: [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
+; DATA-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
+; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
+; DATA-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; DATA-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; DATA-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
+; DATA-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; DATA-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
+; DATA-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
+; DATA-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+; DATA-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
+; DATA-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
+; DATA-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
; DATA-NEXT: ret float [[ADD]]
; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP1]], float [[T_020]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP2]], 0.000000e+00
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP2]], float [[S_021]]
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
+; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
;
entry:
%cmp19 = icmp sgt i32 %N, 0
@@ -3074,75 +4097,243 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
; EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; EVL: for.body.preheader:
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; EVL-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
-; EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
-; EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; EVL-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
; EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
; EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
;
; NO-EVL-LABEL: @idx_scalar(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
; NO-EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; NO-EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; NO-EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; NO-EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; NO-EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; NO-EVL-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; NO-EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; NO-EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
; NO-EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
; NO-EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
;
; DATA-LABEL: @idx_scalar(
; DATA-NEXT: entry:
; DATA-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
; DATA-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; DATA: for.body.preheader:
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; DATA-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; DATA-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; DATA-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; DATA-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; DATA-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; DATA-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; DATA-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
; DATA: for.body:
-; DATA-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[I_010]]
-; DATA-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[I_010]]
-; DATA-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; DATA-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; DATA-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; DATA-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; DATA-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; DATA-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; DATA-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
; DATA-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
; DATA-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
;
entry:
%cmp8.not = icmp eq i64 %n, 0
@@ -3186,75 +4377,273 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
; EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; EVL: for.body.preheader:
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; EVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; EVL-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; EVL-NEXT: [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; EVL-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
+; EVL-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; EVL-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; EVL-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
+; EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
+; EVL-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; EVL-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
+; EVL-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
+; EVL-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
+; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; EVL-NEXT: [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
+; EVL-NEXT: [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
+; EVL-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
+; EVL-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; EVL-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
+; EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
; EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
-; EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
-; EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; EVL-NEXT: [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
; EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
; EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
; NO-EVL-LABEL: @idx_scalar_dec(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
; NO-EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; NO-EVL-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; NO-EVL-NEXT: [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; NO-EVL-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; NO-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; NO-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; NO-EVL-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; NO-EVL-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; NO-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
+; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
+; NO-EVL-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; NO-EVL-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
+; NO-EVL-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; NO-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; NO-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
+; NO-EVL-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
+; NO-EVL-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; NO-EVL-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
+; NO-EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; NO-EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
; NO-EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; NO-EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; NO-EVL-NEXT: [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
; NO-EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
; NO-EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
; DATA-LABEL: @idx_scalar_dec(
; DATA-NEXT: entry:
; DATA-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
; DATA-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; DATA: for.body.preheader:
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; DATA-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; DATA-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; DATA-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; DATA-NEXT: [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; DATA-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; DATA-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
+; DATA-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
+; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
+; DATA-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
+; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
+; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
+; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
+; DATA-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
+; DATA-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
+; DATA-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
+; DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
+; DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
+; DATA-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
+; DATA-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
+; DATA-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
+; DATA-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
+; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; DATA-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
+; DATA-NEXT: [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
+; DATA-NEXT: [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; DATA-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
+; DATA-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
+; DATA-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; DATA-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
+; DATA-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; DATA-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II:%.*]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
; DATA: for.body:
-; DATA-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; DATA-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
; DATA-NEXT: [[SUB]] = add i64 [[I_011]], -1
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[SUB]]
-; DATA-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[SUB]]
-; DATA-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; DATA-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; DATA-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; DATA-NEXT: [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; DATA-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
; DATA-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
; DATA-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; DATA-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
;
entry:
%cmp.not9 = icmp eq i64 %n, 0
@@ -3303,24 +4692,79 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; EVL-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; EVL-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; EVL-NEXT: ret i32 [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
; NO-EVL-NEXT: entry:
@@ -3328,24 +4772,79 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; NO-EVL-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; NO-EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; NO-EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
;
; DATA-LABEL: @simple_csa_int_select_neg_cond(
; DATA-NEXT: entry:
@@ -3353,24 +4852,79 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; DATA: for.body.preheader:
; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; DATA-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
+; DATA-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; DATA-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
+; DATA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
+; DATA-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; DATA-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; DATA-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; DATA-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; DATA-NEXT: ret i32 [[T_0_LCSSA]]
; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; DATA-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP0]]
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
+; DATA-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
@@ -3417,25 +4971,80 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; EVL-NEXT: ret ptr [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; EVL-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_ptr_select(
; NO-EVL-NEXT: entry:
@@ -3443,25 +5052,80 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NO-EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; NO-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; NO-EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; NO-EVL-NEXT: ret ptr [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; NO-EVL-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
;
; DATA-LABEL: @simple_csa_ptr_select(
; DATA-NEXT: entry:
@@ -3469,25 +5133,80 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; DATA: for.body.preheader:
; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; DATA-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; DATA-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; DATA-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; DATA-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; DATA-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; DATA-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; DATA-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; DATA-NEXT: ret ptr [[T_0_LCSSA]]
; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; DATA-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
>From c79b11a735847418ae78e569250550b2f6c53cf6 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 20 Aug 2024 10:11:25 -0700
Subject: [PATCH 05/16] [VPlan] Add cost model for CSA
---
.../Transforms/Vectorize/LoopVectorize.cpp | 16 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 11 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 100 +++++++
.../Transforms/LoopVectorize/RISCV/csa.ll | 279 +++++++-----------
4 files changed, 236 insertions(+), 170 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 66819a9d9fdbd5..b3784ca17bd6c7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7309,9 +7309,17 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
/// not have corresponding recipes in \p Plan and are not marked to be ignored
/// in \p CostCtx. This means the VPlan contains simplification that the legacy
/// cost-model did not account for.
-static bool planContainsAdditionalSimplifications(VPlan &Plan,
- VPCostContext &CostCtx,
- Loop *TheLoop) {
+static bool
+planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx,
+ Loop *TheLoop,
+ LoopVectorizationLegality &Legal) {
+ // CSA cost is more complicated since there is significant overhead in the
+ // preheader and middle block. It also contains recipes that are not backed by
+ // underlying instructions in the original loop. This makes it difficult to
+ // model in the legacy cost model.
+ if (!Legal.getCSAs().empty())
+ return true;
+
// First collect all instructions for the recipes in Plan.
auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
@@ -7418,7 +7426,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
assert((BestFactor.Width == LegacyVF.Width ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
- CostCtx, OrigLoop)) &&
+ CostCtx, OrigLoop, *Legal)) &&
" VPlan cost model and legacy cost model disagreed");
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
"when vectorizing, the scalar cost must be computed.");
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index eb65ec94e8bed3..d71f5340dcd71d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2610,6 +2610,9 @@ class VPCSAHeaderPHIRecipe final : public VPHeaderPHIRecipe {
void execute(VPTransformState &State) override;
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -2641,6 +2644,9 @@ class VPCSADataUpdateRecipe final : public VPSingleDefRecipe {
void execute(VPTransformState &State) override;
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -2687,6 +2693,9 @@ class VPCSAExtractScalarRecipe final : public VPSingleDefRecipe {
void execute(VPTransformState &State) override;
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
@@ -2697,7 +2706,7 @@ class VPCSAExtractScalarRecipe final : public VPSingleDefRecipe {
VPValue *getVPMaskSel() const { return getOperand(1); }
VPValue *getVPDataSel() const { return getOperand(2); }
VPValue *getVPCSAVLSel() const { return getOperand(3); }
- bool usesEVL() { return getNumOperands() == 4; }
+ bool usesEVL() const { return getNumOperands() == 4; }
};
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 37040fbf76f78c..0f9e91932c027f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2289,6 +2289,24 @@ void VPCSAHeaderPHIRecipe::execute(VPTransformState &State) {
State.set(this, DataPhi, Part);
}
+// Estimate the cost of initializing the CSA mask and data vectors
+// (CSAInitMask / CSAInitData) at vectorization factor \p VF, queried from the
+// TTI in \p Ctx. Returns 0 when \p VF is scalar.
+InstructionCost VPCSAHeaderPHIRecipe::computeCost(ElementCount VF,
+                                                  VPCostContext &Ctx) const {
+  if (VF.isScalar())
+    return 0;
+
+  InstructionCost C = 0;
+  auto *VTy = VectorType::get(getUnderlyingValue()->getType(), VF);
+  // The CSA mask phi is a vector of i1 with the same element count as the data.
+  auto *MaskTy = VectorType::get(IntegerType::getInt1Ty(VTy->getContext()), VF);
+  const TargetTransformInfo &TTI = Ctx.TTI;
+
+  // FIXME: These costs should be moved into VPInstruction::computeCost. We put
+  // them here for now since there is no VPInstruction::computeCost support.
+  // CSAInitMask: the splat being costed is the i1 mask, so query the cost on
+  // MaskTy rather than on the data vector type.
+  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, MaskTy);
+  // CSAInitData
+  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VTy);
+  return C;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCSADataUpdateRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -2317,6 +2335,34 @@ void VPCSADataUpdateRecipe::execute(VPTransformState &State) {
}
}
+// Estimate the cost of the CSA data update, together with the mask
+// bookkeeping that is charged to this recipe, at vectorization factor \p VF.
+// Costs are queried from the TTI in \p Ctx with the reciprocal-throughput cost
+// kind. Returns 0 when \p VF is scalar.
+InstructionCost VPCSADataUpdateRecipe::computeCost(ElementCount VF,
+                                                   VPCostContext &Ctx) const {
+  if (VF.isScalar())
+    return 0;
+
+  InstructionCost C = 0;
+  auto *VTy = VectorType::get(getUnderlyingValue()->getType(), VF);
+  auto *MaskTy = VectorType::get(IntegerType::getInt1Ty(VTy->getContext()), VF);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  const TargetTransformInfo &TTI = Ctx.TTI;
+
+  // Data Update
+  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+
+  // FIXME: These costs should be moved into VPInstruction::computeCost. We put
+  // them here for now since they are related to updating the data and there is
+  // no VPInstruction::computeCost support at the moment.
+  // CSAInitMask AnyActive
+  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+  // vp.reduce.or: the any-active reduction is an Or over the i1 mask vector,
+  // so cost it on MaskTy. VTy is the data vector type and may have non-integer
+  // (e.g. pointer) elements, for which an Or reduction is meaningless.
+  C += TTI.getArithmeticReductionCost(Instruction::Or, MaskTy, std::nullopt,
+                                      CostKind);
+  // VPVLSel
+  C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+  // MaskUpdate
+  C += TTI.getArithmeticInstrCost(Instruction::Select, MaskTy, CostKind);
+  // NOTE(review): the selects above are costed through getArithmeticInstrCost;
+  // TTI::getCmpSelInstrCost looks like the intended hook for select -- confirm
+  // the condition types (scalar vs. vector) before switching.
+  return C;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCSAExtractScalarRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -2377,6 +2423,60 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
State.set(this, ChooseFromVecOrInit, 0, /*IsScalar=*/true);
}
+// Estimate the cost of extracting the final scalar from the CSA data vector at
+// vectorization factor \p VF. Costs are queried from the TTI in \p Ctx with
+// the reciprocal-throughput cost kind. Returns 0 when \p VF is scalar.
+//
+// Each term below is labelled with the value it accounts for. Compares and
+// selects are costed with getCmpSelInstrCost and element extraction with
+// getVectorInstrCost, using the operand types of the emitted instructions.
+InstructionCost
+VPCSAExtractScalarRecipe::computeCost(ElementCount VF,
+                                      VPCostContext &Ctx) const {
+  if (VF.isScalar())
+    return 0;
+
+  InstructionCost C = 0;
+  Type *ScalarTy = getUnderlyingValue()->getType();
+  auto *VTy = VectorType::get(ScalarTy, VF);
+  LLVMContext &LLVMCtx = VTy->getContext();
+  Type *Int1Ty = IntegerType::getInt1Ty(LLVMCtx);
+  Type *Int32Ty = IntegerType::getInt32Ty(LLVMCtx);
+  auto *Int32VTy = VectorType::get(Int32Ty, VF);
+  auto *MaskTy = VectorType::get(Int1Ty, VF);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  const TargetTransformInfo &TTI = Ctx.TTI;
+
+  // StepVector
+  ArrayRef<Value *> Args;
+  IntrinsicCostAttributes CostAttrs(Intrinsic::stepvector, Int32VTy, Args);
+  C += TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+  // NegOneSplat
+  C += TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, Int32VTy);
+  // LastIdx
+  if (usesEVL()) {
+    // NOTE(review): kept as in the original; the EVL lowering is not checked
+    // here -- confirm the reduction kind against execute().
+    C += TTI.getMinMaxReductionCost(Intrinsic::smax, Int32VTy, FastMathFlags(),
+                                    CostKind);
+  } else {
+    // ActiveLaneIdxs: vector select of i32 lane indices under the i1 mask.
+    C += TTI.getCmpSelInstrCost(Instruction::Select, Int32VTy, MaskTy,
+                                CmpInst::BAD_ICMP_PREDICATE, CostKind);
+    // MaybeLastIdx: the lane indices are reduced with an unsigned max.
+    C += TTI.getMinMaxReductionCost(Intrinsic::umax, Int32VTy, FastMathFlags(),
+                                    CostKind);
+    // IsLaneZeroActive: extract lane 0 of the mask.
+    C += TTI.getVectorInstrCost(Instruction::ExtractElement, MaskTy, CostKind,
+                                /*Index=*/0);
+    // MaybeLastIdxEQZero: scalar i32 compare producing an i1.
+    C += TTI.getCmpSelInstrCost(Instruction::ICmp, Int32Ty, Int1Ty,
+                                CmpInst::ICMP_EQ, CostKind);
+    // And
+    C += TTI.getArithmeticInstrCost(Instruction::And, Int1Ty, CostKind);
+    // LastIdx: scalar select between two i32 indices.
+    C += TTI.getCmpSelInstrCost(Instruction::Select, Int32Ty, Int1Ty,
+                                CmpInst::BAD_ICMP_PREDICATE, CostKind);
+  }
+  // ExtractFromVec: the lane index is only known at run time, so no constant
+  // index is passed.
+  C += TTI.getVectorInstrCost(Instruction::ExtractElement, VTy, CostKind);
+  // LastIdxGeZero: scalar i32 compare, not a vector compare.
+  C += TTI.getCmpSelInstrCost(Instruction::ICmp, Int32Ty, Int1Ty,
+                              CmpInst::ICMP_SGE, CostKind);
+  // ChooseFromVecOrInit: scalar select between the extracted element and the
+  // initial value.
+  C += TTI.getCmpSelInstrCost(Instruction::Select, ScalarTy, Int1Ty,
+                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
+  return C;
+}
+
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Branch on Mask works only on single instance.");
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
index 89a2dbadd3a2ff..c90d88d912edaa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
@@ -4383,69 +4383,52 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; EVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; EVL-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; EVL: vector.body:
; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; EVL-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; EVL-NEXT: [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; EVL-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; EVL-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
-; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
-; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
-; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; EVL-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
-; EVL-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
-; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
-; EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
-; EVL-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
-; EVL-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
-; EVL-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
-; EVL-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
-; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
-; EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
-; EVL-NEXT: [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
-; EVL-NEXT: [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; EVL-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; EVL-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; EVL-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; EVL-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; EVL-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; EVL: middle.block:
-; EVL-NEXT: [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
-; EVL-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
-; EVL-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; EVL-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
-; EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; EVL-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; EVL-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; EVL-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; EVL: for.cond.cleanup:
; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -4455,10 +4438,10 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
; EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; EVL-NEXT: [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
+; EVL-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
; EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
; EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
; EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -4474,69 +4457,52 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-EVL: vector.body:
; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; NO-EVL-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; NO-EVL-NEXT: [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; NO-EVL-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; NO-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
-; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; NO-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
-; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
-; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; NO-EVL-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
-; NO-EVL-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; NO-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
-; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
-; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
-; NO-EVL-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
-; NO-EVL-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
-; NO-EVL-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
-; NO-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; NO-EVL-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; NO-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; NO-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; NO-EVL-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; NO-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; NO-EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; NO-EVL-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; NO-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; NO-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
-; NO-EVL-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
-; NO-EVL-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; NO-EVL-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
-; NO-EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; NO-EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; NO-EVL-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; NO-EVL-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; NO-EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; NO-EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
; NO-EVL: for.cond.cleanup:
; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -4546,10 +4512,10 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
; NO-EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; NO-EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; NO-EVL-NEXT: [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
+; NO-EVL-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
; NO-EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
; NO-EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
@@ -4565,69 +4531,52 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; DATA-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; DATA-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3>
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; DATA-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
; DATA: vector.body:
; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL8:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <4 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL9:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; DATA-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; DATA-NEXT: [[TMP1:%.*]] = add <4 x i64> [[STEP_ADD]], <i64 -1, i64 -1, i64 -1, i64 -1>
-; DATA-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; DATA-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]]
-; DATA-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
-; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
-; DATA-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3
-; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -4
-; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -3
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; DATA-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8
-; DATA-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD2]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]]
-; DATA-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]]
-; DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 -3
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 -4
-; DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 -3
-; DATA-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP13]], align 8
-; DATA-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP15]], align 8
-; DATA-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE5]]
-; DATA-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i64> [[REVERSE3]], [[REVERSE7]]
-; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
-; DATA-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
-; DATA-NEXT: [[CSA_MASK_SEL:%.*]] = select i1 [[TMP18]], <4 x i1> [[TMP16]], <4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_MASK_SEL8]] = select i1 [[TMP19]], <4 x i1> [[TMP17]], <4 x i1> [[CSA_MASK_SEL]]
-; DATA-NEXT: [[CSA_DATA_SEL:%.*]] = select i1 [[TMP18]], <4 x i64> [[VEC_IND]], <4 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL9]] = select i1 [[TMP19]], <4 x i64> [[STEP_ADD]], <4 x i64> [[CSA_DATA_SEL]]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; DATA-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; DATA-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; DATA-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; DATA-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; DATA-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; DATA-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; DATA-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; DATA-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 -4, i64 -4, i64 -4, i64 -4>
-; DATA-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; DATA-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; DATA: middle.block:
-; DATA-NEXT: [[TMP21:%.*]] = select <4 x i1> [[CSA_MASK_SEL8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP21]])
-; DATA-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[CSA_MASK_SEL8]], i64 0
-; DATA-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; DATA-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <4 x i64> [[CSA_DATA_SEL9]], i32 [[TMP26]]
-; DATA-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; DATA-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; DATA-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; DATA-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; DATA-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; DATA-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; DATA-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; DATA-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; DATA: scalar.ph:
; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
; DATA-NEXT: br label [[FOR_BODY:%.*]]
; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
; DATA: for.cond.cleanup:
; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -4637,10 +4586,10 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; DATA-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
; DATA-NEXT: [[SUB]] = add i64 [[I_011]], -1
; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; DATA-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; DATA-NEXT: [[TMP30:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; DATA-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP29]], [[TMP30]]
+; DATA-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; DATA-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
; DATA-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
; DATA-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
; DATA-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
>From bdc9d288bf6540a55ec81ee08008cfe52a7dbc31 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 30 Aug 2024 08:48:08 -0700
Subject: [PATCH 06/16] fixup! respond to @artagnon initial set of review
---
llvm/lib/Transforms/Vectorize/VPlan.h | 13 ---------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 26 ------------------
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 27 +++++++++++++++++++
llvm/lib/Transforms/Vectorize/VPlanUtils.h | 11 ++++++++
.../Transforms/Vectorize/VPlanVerifier.cpp | 1 +
.../Transforms/LoopVectorize/RISCV/csa.ll | 8 +++---
6 files changed, 43 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d71f5340dcd71d..2eb887a0e7d198 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4105,19 +4105,6 @@ class VPlanSlp {
bool isCompletelySLP() const { return CompletelySLP; }
};
-namespace vputils {
-
-/// Returns true for PHI-like recipes.
-bool isPhi(const VPRecipeBase &R);
-
-/// Returns true for PHI-like recipes that generate their own backedge
-bool isPhiThatGeneratesBackedge(const VPRecipeBase &R);
-
-/// Returns true for PHI-like recipes that exists in vector loop header basic
-/// block
-bool isHeaderPhi(const VPRecipeBase &R);
-} // end namespace vputils
-
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0f9e91932c027f..6d75a7273c69a4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3541,29 +3541,3 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-bool vputils::isPhi(const VPRecipeBase &R) {
- if (R.isPhi())
- return true;
- if (auto *VPInst = dyn_cast<VPInstruction>(&R))
- return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
- VPInst->getOpcode() == VPInstruction::CSAVLPhi;
- return false;
-}
-
-bool vputils::isPhiThatGeneratesBackedge(const VPRecipeBase &R) {
- if (isa<VPWidenPHIRecipe, VPCSAHeaderPHIRecipe>(&R))
- return true;
- if (auto *VPInst = dyn_cast<VPInstruction>(&R))
- return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
- VPInst->getOpcode() == VPInstruction::CSAVLPhi;
- return false;
-}
-
-bool vputils::isHeaderPhi(const VPRecipeBase &R) {
- if (isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(&R))
- return true;
- if (auto *VPInst = dyn_cast<VPInstruction>(&R))
- return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
- VPInst->getOpcode() == VPInstruction::CSAVLPhi;
- return false;
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index c18bea4f4c5926..34785d6aba39fe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -60,3 +60,30 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
return match(V, m_Binary<Instruction::ICmp>(m_VPValue(A), m_VPValue(B))) &&
IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount();
}
+
+bool vputils::isPhi(const VPRecipeBase &R) {
+ if (R.isPhi())
+ return true;
+ if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+ return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+ VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+ return false;
+}
+
+bool vputils::isPhiThatGeneratesBackedge(const VPRecipeBase &R) {
+ if (isa<VPWidenPHIRecipe, VPCSAHeaderPHIRecipe>(&R))
+ return true;
+ if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+ return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+ VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+ return false;
+}
+
+bool vputils::isHeaderPhi(const VPRecipeBase &R) {
+ if (isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(&R))
+ return true;
+ if (auto *VPInst = dyn_cast<VPInstruction>(&R))
+ return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
+ VPInst->getOpcode() == VPInstruction::CSAVLPhi;
+ return false;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index fc11208a433961..ddbb32a1ec0c83 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -45,6 +45,17 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
/// Return true if \p V is a header mask in \p Plan.
bool isHeaderMask(const VPValue *V, VPlan &Plan);
+
+/// Returns true for PHI-like recipes.
+bool isPhi(const VPRecipeBase &R);
+
+/// Returns true for PHI-like recipes that generate their own backedge.
+bool isPhiThatGeneratesBackedge(const VPRecipeBase &R);
+
+/// Returns true for PHI-like recipes that exist in the vector loop header
+/// basic block.
+bool isHeaderPhi(const VPRecipeBase &R);
+
} // end namespace llvm::vputils
#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index b4bc3de463de11..baafa70b9effbf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -15,6 +15,7 @@
#include "VPlanVerifier.h"
#include "VPlan.h"
#include "VPlanCFG.h"
+#include "VPlanUtils.h"
#include "VPlanDominatorTree.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
index c90d88d912edaa..e67002e2b2d905 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
@@ -259,7 +259,7 @@ for.body: ; preds = %for.body.preheader,
}
; This function is generated from the following C/C++ program:
-; int simple_csa_int_select(int N, int *data) {
+; int simple_csa_int_select_induction_cmp(int N, int *data) {
; int t = -1;
; for (int i = 0; i < N; i++) {
; if (i < data[i])
@@ -1373,13 +1373,13 @@ for.body: ; preds = %for.body.preheader,
}
; This function is generated from the following C/C++ program:
-; int csa_in_series_int_select(int N, int *data0, int *data1) {
+; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
; int t = -1;
; int s = -1;
; for (int i = 0; i < N; i++) {
-; if (a < data0[i])
+; if (i < data0[i])
; t = data0[i];
-; if (a < data1[i])
+; if (i < data1[i])
; s = data1[i];
; }
; return t | s; // use t and s
>From 6ecc529450c4de5e0b63cac8d1a80e4c80ebc63c Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 30 Aug 2024 10:33:17 -0700
Subject: [PATCH 07/16] fixup! reorganize tests
---
.../RISCV/conditional-scalar-assignment.ll | 833 +++
.../Transforms/LoopVectorize/RISCV/csa.ll | 5188 -----------------
.../conditional-scalar-assignment.ll | 3091 ++++++++++
3 files changed, 3924 insertions(+), 5188 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
delete mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
new file mode 100644
index 00000000000000..6d7816800603e2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -0,0 +1,833 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN: -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
+; RUN: | FileCheck %s -check-prefix=EVL
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=none \
+; RUN: -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
+; RUN: | FileCheck %s -check-prefix=NO-EVL
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data \
+; RUN: -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
+; RUN: | FileCheck %s -check-prefix=DATA
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+; uint64_t idx = ii;
+; for (uint64_t i = 0; i < n; ++i)
+; idx = (a[i] > b[i]) ? i : idx;
+; return idx;
+; }
+define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; EVL-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; EVL-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; NO-EVL-LABEL: @idx_scalar(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; NO-EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; NO-EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; NO-EVL-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; NO-EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; NO-EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; NO-EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; NO-EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; DATA-LABEL: @idx_scalar(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
+; DATA-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
+; DATA-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; DATA-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
+; DATA-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; DATA-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
+; DATA-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; DATA-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; DATA-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
+; DATA-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
+; DATA-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; DATA-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; DATA-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
+; DATA-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ %cmp8.not = icmp eq i64 %n, 0
+ br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %cond.lcssa = phi i64 [ %cond, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+ ret i64 %idx.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
+ %0 = load i64, ptr %arrayidx, align 8
+ %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
+ %1 = load i64, ptr %arrayidx1, align 8
+ %cmp2 = icmp sgt i64 %0, %1
+ %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
+ %inc = add nuw i64 %i.010, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+; uint64_t idx = ii;
+; for (uint64_t i = n; i > 0; --i) // decreasing
+; idx = (a[i - 1] > b[i - 1]) ? i : idx;
+; return idx;
+; }
+define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar_dec(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; EVL-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; EVL-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; EVL-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; EVL-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; EVL-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; EVL-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; EVL-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; EVL-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; EVL-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; EVL-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; EVL-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
+; EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; NO-EVL-LABEL: @idx_scalar_dec(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; NO-EVL-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; NO-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; NO-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; NO-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; NO-EVL-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; NO-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; NO-EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; NO-EVL-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; NO-EVL-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; NO-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; NO-EVL-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; NO-EVL-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; NO-EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; NO-EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; NO-EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; NO-EVL-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
+; NO-EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; NO-EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; DATA-LABEL: @idx_scalar_dec(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; DATA-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; DATA-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+; DATA-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
+; DATA-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
+; DATA-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; DATA-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
+; DATA-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
+; DATA-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; DATA-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; DATA-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
+; DATA-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
+; DATA-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
+; DATA-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
+; DATA-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
+; DATA-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
+; DATA-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
+; DATA-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
+; DATA-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; DATA-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
+; DATA-NEXT: [[SUB]] = add i64 [[I_011]], -1
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
+; DATA-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
+; DATA-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
+; DATA-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
+; DATA-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
+; DATA-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
+; DATA-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+ %cmp.not9 = icmp eq i64 %n, 0
+ br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %cond.lcssa = phi i64 [ %cond, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+ ret i64 %idx.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
+ %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+ %sub = add i64 %i.011, -1
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
+ %0 = load i64, ptr %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
+ %1 = load i64, ptr %arrayidx2, align 8
+ %cmp3 = icmp sgt i64 %0, %1
+ %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
+ %cmp.not = icmp eq i64 %sub, 0
+ br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
+; This function is generated from the following C/C++ program:
+; int *simple_csa_ptr_select(int N, int **data) {
+; int *t = nullptr;
+; for (int i = 0; i < N; i++) {
+; if (i < *data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_ptr_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: ret ptr [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; EVL-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_ptr_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; NO-EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; NO-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; NO-EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: ret ptr [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; NO-EVL-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_ptr_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
+; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
+; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
+; DATA-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; DATA-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
+; DATA-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; DATA-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; DATA-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
+; DATA-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; DATA-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; DATA-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
+; DATA-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT: ret ptr [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+; DATA-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret ptr %t.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+ %0 = load ptr, ptr %arrayidx, align 8
+ %1 = load i32, ptr %0, align 4
+ %2 = sext i32 %1 to i64
+ %cmp1 = icmp slt i64 %indvars.iv, %2
+ %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll b/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
deleted file mode 100644
index e67002e2b2d905..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/RISCV/csa.ll
+++ /dev/null
@@ -1,5188 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
-; RUN: -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
-; RUN: -enable-csa-vectorization | FileCheck %s -check-prefix=EVL
-; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
-; RUN: -passes=loop-vectorize -force-tail-folding-style=none \
-; RUN: -enable-csa-vectorization | FileCheck %s -check-prefix=NO-EVL
-; RUN: opt < %s -S -mtriple riscv64 -mattr="+v" -riscv-v-vector-bits-min=256 \
-; RUN: -passes=loop-vectorize -force-tail-folding-style=data \
-; RUN: -enable-csa-vectorization | FileCheck %s -check-prefix=DATA
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int_select(int N, int *data, int a) {
-; int t = -1;
-; for (int i = 0; i < N; i++) {
-; if (a < data[i])
-; t = data[i];
-; }
-; return t; // use t
-; }
-define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
-; EVL-LABEL: @simple_csa_int_select(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
-; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; EVL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; EVL-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
-; EVL-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; NO-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; NO-EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; NO-EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; NO-EVL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; NO-EVL-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
-; NO-EVL-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; NO-EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP6]]
-; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; DATA-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; DATA-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; DATA-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; DATA-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; DATA-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP18]]
-; DATA-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; DATA-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP22]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP21]], i32 [[T_010]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-entry:
- %cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
- ret i32 %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %1 = sext i32 %0 to i64
- %cmp1 = icmp slt i64 %a, %1
- %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int_select_induction_cmp(int N, int *data) {
-; int t = -1;
-; for (int i = 0; i < N; i++) {
-; if (i < data[i])
-; t = data[i];
-; }
-; return t; // use t
-; }
-define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_int_select_induction_cmp(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; NO-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; NO-EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; NO-EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select_induction_cmp(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; DATA-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP19:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; DATA-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; DATA-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; DATA-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; DATA-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP28]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP27]], i32 [[T_010]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-entry:
- %cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
- ret i32 %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %1 = sext i32 %0 to i64
- %cmp1 = icmp slt i64 %indvars.iv, %1
- %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float simple_csa_float_select(int N, float *data) {
-; float t = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (0.0f < data[i])
-; t = data[i];
-; }
-; return t; // use t
-; }
-define float @simple_csa_float_select(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_float_select(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
-; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; EVL-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; EVL-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; EVL-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
-; EVL-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
-; EVL-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
-; EVL-NEXT: [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
-; EVL-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT: ret float [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
-; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_float_select(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; NO-EVL-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; NO-EVL-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; NO-EVL-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
-; NO-EVL-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
-; NO-EVL-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
-; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_float_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP6]]
-; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; DATA-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; DATA-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; DATA-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 0
-; DATA-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
-; DATA-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP17]]
-; DATA-NEXT: [[TMP18:%.*]] = icmp sge i32 [[TMP17]], 0
-; DATA-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], float [[CSA_EXTRACT]], float 1.000000e+00
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret float [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP20]], 0.000000e+00
-; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP20]], float [[T_09]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-entry:
- %cmp8 = icmp sgt i32 %N, 0
- br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.body, %entry
- %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
- ret float %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %data, i64 %indvars.iv
- %0 = load float, ptr %arrayidx, align 4
- %cmp1 = fcmp ogt float %0, 0.000000e+00
- %t.1 = select i1 %cmp1, float %0, float %t.09
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int simple_csa_int(int N, bool *cond, int *data) {
-; int t = -1;
-; for (int i = 0; i < N; i++) {
-; if (cond[i])
-; t = data[i];
-; }
-; return t; // use t
-; }
-define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
-; EVL-LABEL: @simple_csa_int(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @simple_csa_int(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @simple_csa_int(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_07:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_1]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp6 = icmp sgt i32 %N, 0
- br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.inc, %entry
- %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
- ret i32 %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
- %0 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %0, 0
- br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
- %1 = load i32, ptr %arrayidx2, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body, %if.then
- %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float simple_csa_float(int N, bool *cond, float *data) {
-; float t = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (cond[i])
-; t = data[i];
-; }
-; return t; // use t
-; }
-define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
-; EVL-LABEL: @simple_csa_float(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT: ret float [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @simple_csa_float(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @simple_csa_float(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret float [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_07:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_1]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_07]], [[FOR_BODY]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp6 = icmp sgt i32 %N, 0
- br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.inc, %entry
- %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
- ret float %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
- %0 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %0, 0
- br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %indvars.iv
- %1 = load float, ptr %arrayidx2, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body, %if.then
- %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int_select(int N, int *data0, int *data1, int a) {
-; int t = -1;
-; int s = -1;
-; for (int i = 0; i < N; i++) {
-; if (a < data0[i])
-; t = data0[i];
-; if (a < data1[i])
-; s = data1[i];
-; }
-; return t | s; // use t and s
-; }
-define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
-; EVL-LABEL: @csa_in_series_int_select(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; EVL-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; EVL-NEXT: [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
-; EVL-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
-; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; EVL-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
-; EVL-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
-; EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
-; EVL-NEXT: [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
-; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
-; EVL-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
-; EVL-NEXT: [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
-; EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
-; EVL-NEXT: [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
-; EVL-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret i32 [[OR]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
-; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_int_select(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; NO-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; NO-EVL-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; NO-EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; NO-EVL-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; NO-EVL-NEXT: [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
-; NO-EVL-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; NO-EVL-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
-; NO-EVL-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
-; NO-EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
-; NO-EVL-NEXT: [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
-; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
-; NO-EVL-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
-; NO-EVL-NEXT: [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
-; NO-EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
-; NO-EVL-NEXT: [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
-; NO-EVL-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[OR]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
-; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_int_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[A:%.*]], i64 0
-; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; DATA-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT: [[TMP10:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; DATA-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP11]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP11]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
-; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP13]], align 4
-; DATA-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; DATA-NEXT: [[TMP15:%.*]] = icmp slt <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
-; DATA-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
-; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP16]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP16]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP18]])
-; DATA-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
-; DATA-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
-; DATA-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP23]]
-; DATA-NEXT: [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
-; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[CSA_EXTRACT7]], i32 -1
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP26:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP26]])
-; DATA-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP27]], 0
-; DATA-NEXT: [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
-; DATA-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP31]]
-; DATA-NEXT: [[TMP32:%.*]] = icmp sge i32 [[TMP31]], 0
-; DATA-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[TMP34:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP34]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret i32 [[OR]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP36:%.*]] = sext i32 [[TMP35]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP36]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP35]], i32 [[T_022]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP38]]
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP37]], i32 [[S_023]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-entry:
- %cmp21 = icmp sgt i32 %N, 0
- br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %0 = or i32 %s.1, %spec.select
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
- ret i32 %or
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
- %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
- %1 = load i32, ptr %arrayidx, align 4
- %2 = sext i32 %1 to i64
- %cmp1 = icmp slt i64 %a, %2
- %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
- %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
- %3 = load i32, ptr %arrayidx5, align 4
- %4 = sext i32 %3 to i64
- %cmp6 = icmp slt i64 %a, %4
- %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
-; int t = -1;
-; int s = -1;
-; for (int i = 0; i < N; i++) {
-; if (i < data0[i])
-; t = data0[i];
-; if (i < data1[i])
-; s = data1[i];
-; }
-; return t | s; // use t and s
-; }
-define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
-; EVL-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; EVL-NEXT: [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
-; EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
-; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; EVL-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; EVL-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; EVL-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
-; EVL-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
-; EVL-NEXT: [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
-; EVL-NEXT: [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
-; EVL-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
-; EVL-NEXT: [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
-; EVL-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret i32 [[OR]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
-; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
-; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
-; NO-EVL-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; NO-EVL-NEXT: [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
-; NO-EVL-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; NO-EVL-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; NO-EVL-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; NO-EVL-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
-; NO-EVL-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; NO-EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
-; NO-EVL-NEXT: [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
-; NO-EVL-NEXT: [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
-; NO-EVL-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
-; NO-EVL-NEXT: [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
-; NO-EVL-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[OR]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
-; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; DATA-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI2]]
-; DATA-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
-; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
-; DATA-NEXT: [[TMP20:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD3]] to <vscale x 4 x i64>
-; DATA-NEXT: [[TMP21:%.*]] = icmp slt <vscale x 4 x i64> [[VEC_IND]], [[TMP20]]
-; DATA-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP21]])
-; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP22]], <vscale x 4 x i1> [[TMP21]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP22]], <vscale x 4 x i32> [[WIDE_LOAD3]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; DATA-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; DATA-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; DATA-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL5]], i32 [[TMP29]]
-; DATA-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; DATA-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[CSA_EXTRACT7]], i32 -1
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP32:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP32]])
-; DATA-NEXT: [[TMP34:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP33]], 0
-; DATA-NEXT: [[TMP36:%.*]] = and i1 [[TMP34]], [[TMP35]]
-; DATA-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP37]]
-; DATA-NEXT: [[TMP38:%.*]] = icmp sge i32 [[TMP37]], 0
-; DATA-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[TMP40:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP40]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret i32 [[OR]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP42:%.*]] = sext i32 [[TMP41]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP42]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP41]], i32 [[T_022]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[TMP44:%.*]] = sext i32 [[TMP43]] to i64
-; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP44]]
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP43]], i32 [[S_023]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-entry:
- %cmp21 = icmp sgt i32 %N, 0
- br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %0 = or i32 %s.1, %spec.select
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
- ret i32 %or
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
- %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
- %1 = load i32, ptr %arrayidx, align 4
- %2 = sext i32 %1 to i64
- %cmp1 = icmp slt i64 %indvars.iv, %2
- %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
- %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
- %3 = load i32, ptr %arrayidx5, align 4
- %4 = sext i32 %3 to i64
- %cmp6 = icmp slt i64 %indvars.iv, %4
- %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_float_select(int N, float *data0,
-; float *data1) {
-; float t = 1.0f;
-; float s = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (0.0f < data0[i])
-; t = data0[i];
-; if (0.0f <data1[i])
-; s = data1[i];
-; }
-; return t + s; // use t and s
-; }
-define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_float_select(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; EVL-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; EVL-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
-; EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
-; EVL-NEXT: [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
-; EVL-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
-; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
-; EVL-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; EVL-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
-; EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; EVL-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; EVL-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; EVL-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
-; EVL-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret float [[ADD]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
-; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
-; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-; NO-EVL-LABEL: @csa_in_series_float_select(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; NO-EVL-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; NO-EVL-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
-; NO-EVL-NEXT: [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
-; NO-EVL-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
-; NO-EVL-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; NO-EVL-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; NO-EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
-; NO-EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; NO-EVL-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; NO-EVL-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; NO-EVL-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
-; NO-EVL-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; NO-EVL-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret float [[ADD]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
-; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
-; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_float_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 4 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP6]]
-; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4
-; DATA-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; DATA-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP10]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x float> [[CSA_DATA_PHI2]]
-; DATA-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP6]]
-; DATA-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
-; DATA-NEXT: [[TMP13:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD3]], zeroinitializer
-; DATA-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP13]])
-; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP14]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD3]], <vscale x 4 x float> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL4]], <vscale x 4 x i32> [[CSA_STEP6]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP16]])
-; DATA-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; DATA-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; DATA-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL5]], i32 [[TMP21]]
-; DATA-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; DATA-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP24:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; DATA-NEXT: [[TMP26:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP25]], 0
-; DATA-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
-; DATA-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x float> [[CSA_DATA_SEL]], i32 [[TMP29]]
-; DATA-NEXT: [[TMP30:%.*]] = icmp sge i32 [[TMP29]], 0
-; DATA-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], float [[CSA_EXTRACT]], float 1.000000e+00
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[TMP32:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP32]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret float [[ADD]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP33:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP33]], 0.000000e+00
-; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP33]], float [[T_020]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP34:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP34]], 0.000000e+00
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP34]], float [[S_021]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-entry:
- %cmp19 = icmp sgt i32 %N, 0
- br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %0 = fadd float %t.1, %s.1
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
- ret float %add
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
- %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
- %1 = load float, ptr %arrayidx, align 4
- %cmp1 = fcmp ogt float %1, 0.000000e+00
- %t.1 = select i1 %cmp1, float %1, float %t.020
- %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
- %2 = load float, ptr %arrayidx5, align 4
- %cmp6 = fcmp ogt float %2, 0.000000e+00
- %s.1 = select i1 %cmp6, float %2, float %s.021
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
-; int t = -1;
-; int s = -1;
-; for (int i = 0; i < N; i++) {
-; if (cond0[i])
-; t = data0[i];
-; if (cond1[i])
-; s = data1[i];
-; }
-; return t | s; // use t and s
-; }
-define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) { ; two conditional scalar assignments (t then s) in one loop; the checks below contain no vector.body block for any of the three check prefixes
-; EVL-LABEL: @csa_in_series_int(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret i32 [[OR]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[IF_END]]
-; EVL: if.end:
-; EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL: if.then6:
-; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_int(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[OR]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[IF_END]]
-; NO-EVL: if.end:
-; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL: if.then6:
-; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_int(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret i32 [[OR]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[IF_END]]
-; DATA: if.end:
-; DATA-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA: if.then6:
-; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[S_1]] = phi i32 [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp15 = icmp sgt i32 %N, 0 ; signed test: the loop is entered only when N > 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64 ; trip count widened to the i64 width of the induction variable
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.inc
- %0 = or i32 %s.1, %t.1 ; combine the final s and t, the `t | s` of the C source
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ] ; -1 when the loop was skipped (both t and s start at -1)
- ret i32 %or
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; loop index i, starting at 0
- %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried across iterations, initially -1
- %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried across iterations, initially -1
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv ; &cond0[i], one byte per element
- %1 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %1, 0 ; true when cond0[i] is zero
- br i1 %tobool.not, label %if.end, label %if.then ; data0[i] is loaded only when cond0[i] is nonzero
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv ; &data0[i]
- %2 = load i32, ptr %arrayidx2, align 4 ; candidate new value for t
- br label %if.end
-
-if.end: ; preds = %if.then, %for.body
- %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ] ; t after this iteration: data0[i] if assigned, otherwise the previous t
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv ; &cond1[i], one byte per element
- %3 = load i8, ptr %arrayidx4, align 1
- %tobool5.not = icmp eq i8 %3, 0 ; true when cond1[i] is zero
- br i1 %tobool5.not, label %for.inc, label %if.then6 ; data1[i] is loaded only when cond1[i] is nonzero
-
-if.then6: ; preds = %if.end
- %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv ; &data1[i]
- %4 = load i32, ptr %arrayidx8, align 4 ; candidate new value for s
- br label %for.inc
-
-for.inc: ; preds = %if.end, %if.then6
- %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ] ; s after this iteration: data1[i] if assigned, otherwise the previous s
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; i + 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; leave the loop once i + 1 equals the trip count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_float(int N, bool *cond0, bool *cond1, float *data0,
-; float *data1) {
-; float t = 1.0f;
-; float s = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (cond0[i])
-; t = data0[i];
-; if (cond1[i])
-; s = data1[i];
-; }
-; return t + s; // use t and s
-; }
-define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_float(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret float [[ADD]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[IF_END]]
-; EVL: if.end:
-; EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL: if.then6:
-; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_float(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret float [[ADD]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[IF_END]]
-; NO-EVL: if.end:
-; NO-EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL: if.then6:
-; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_float(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret float [[ADD]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[IF_END]]
-; DATA: if.end:
-; DATA-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA: if.then6:
-; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[S_1]] = phi float [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_END]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.inc
- %0 = fadd float %t.1, %s.1
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
- ret float %add
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
- %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
- %1 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %1, 0
- br i1 %tobool.not, label %if.end, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
- %2 = load float, ptr %arrayidx2, align 4
- br label %if.end
-
-if.end: ; preds = %if.then, %for.body
- %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
- %3 = load i8, ptr %arrayidx4, align 1
- %tobool5.not = icmp eq i8 %3, 0
- br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6: ; preds = %if.end
- %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
- %4 = load float, ptr %arrayidx8, align 4
- br label %for.inc
-
-for.inc: ; preds = %if.end, %if.then6
- %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_same_scalar_int_select(int N, int *data0,
-; int *data1) {
-; int t = -1;
-; for (int i = 0; i < N; i++) {
-; if (i < data0[i])
-; t = data0[i];
-; if (i < data1[i])
-; t = data1[i];
-; }
-; return t; // use t
-; }
-define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_int_select(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; EVL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
-; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
-; EVL-NEXT: [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; NO-EVL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
-; NO-EVL-NEXT: [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_same_scalar_int_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP1]]
-; DATA-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1]], i32 [[TMP0]], i32 [[T_022]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
-; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP3]]
-; DATA-NEXT: [[T_2]] = select i1 [[CMP6]], i32 [[TMP2]], i32 [[SPEC_SELECT]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp21 = icmp sgt i32 %N, 0
- br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.body, %entry
- %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
- ret i32 %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %1 = sext i32 %0 to i64
- %cmp1 = icmp slt i64 %indvars.iv, %1
- %spec.select = select i1 %cmp1, i32 %0, i32 %t.022
- %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
- %2 = load i32, ptr %arrayidx5, align 4
- %3 = sext i32 %2 to i64
- %cmp6 = icmp slt i64 %indvars.iv, %3
- %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_same_scalar_float_select(int N,
-; float *data0, float *data1) {
-; float t = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (0.0f < data0[i])
-; t = data0[i];
-; if (0.0f < data1[i])
-; t = data1[i];
-; }
-; return t; // use t
-; }
-define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_float_select(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT: ret float [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; EVL-NEXT: [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; EVL-NEXT: [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; NO-EVL-NEXT: [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; NO-EVL-NEXT: [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_same_scalar_float_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret float [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 0.000000e+00
-; DATA-NEXT: [[T_1:%.*]] = select i1 [[CMP1]], float [[TMP0]], float [[T_020]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP1]], 0.000000e+00
-; DATA-NEXT: [[T_2]] = select i1 [[CMP6]], float [[TMP1]], float [[T_1]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp19 = icmp sgt i32 %N, 0
- br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.body, %entry
- %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
- ret float %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
- %0 = load float, ptr %arrayidx, align 4
- %cmp1 = fcmp ogt float %0, 0.000000e+00
- %t.1 = select i1 %cmp1, float %0, float %t.020
- %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
- %1 = load float, ptr %arrayidx5, align 4
- %cmp6 = fcmp ogt float %1, 0.000000e+00
- %t.2 = select i1 %cmp6, float %1, float %t.1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_in_series_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
-; int *data1) {
-; int t = -1;
-; for (int i = 0; i < N; i++) {
-; if (cond0[i])
-; t = data0[i];
-; if (cond1[i])
-; t = data1[i];
-; }
-; return t; // use t
-; }
-define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_int(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[IF_END]]
-; EVL: if.end:
-; EVL-NEXT: [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL: if.then6:
-; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[IF_END]]
-; NO-EVL: if.end:
-; NO-EVL-NEXT: [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL: if.then6:
-; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_same_scalar_int(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_2_LCSSA:%.*]] = phi i32 [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[IF_END]]
-; DATA: if.end:
-; DATA-NEXT: [[T_1:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA: if.then6:
-; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_2]] = phi i32 [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.inc, %entry
- %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
- ret i32 %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
- %0 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %0, 0
- br i1 %tobool.not, label %if.end, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
- %1 = load i32, ptr %arrayidx2, align 4
- br label %if.end
-
-if.end: ; preds = %if.then, %for.body
- %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
- %2 = load i8, ptr %arrayidx4, align 1
- %tobool5.not = icmp eq i8 %2, 0
- br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6: ; preds = %if.end
- %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
- %3 = load i32, ptr %arrayidx8, align 4
- br label %for.inc
-
-for.inc: ; preds = %if.end, %if.then6
- %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_in_series_same_scalar_float(int N, bool *cond0, bool *cond1,
-; float *data0, float *data1) {
-; float t = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (cond0[i])
-; t = data0[i];
-; if (cond1[i])
-; t = data1[i];
-; }
-; return t; // use t
-; }
-define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_in_series_same_scalar_float(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT: ret float [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[IF_END]]
-; EVL: if.end:
-; EVL-NEXT: [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL: if.then6:
-; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[IF_END]]
-; NO-EVL: if.end:
-; NO-EVL-NEXT: [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL: if.then6:
-; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_in_series_same_scalar_float(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_2_LCSSA:%.*]] = phi float [ [[T_2:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_2_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret float [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_2]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[IF_END]]
-; DATA: if.end:
-; DATA-NEXT: [[T_1:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ [[T_016]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
-; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA: if.then6:
-; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_2]] = phi float [ [[TMP3]], [[IF_THEN6]] ], [ [[T_1]], [[IF_END]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.inc, %entry
- %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
- ret float %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
- %0 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %0, 0
- br i1 %tobool.not, label %if.end, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
- %1 = load float, ptr %arrayidx2, align 4
- br label %if.end
-
-if.end: ; preds = %if.then, %for.body
- %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
- %2 = load i8, ptr %arrayidx4, align 1
- %tobool5.not = icmp eq i8 %2, 0
- br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6: ; preds = %if.end
- %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
- %3 = load float, ptr %arrayidx8, align 4
- br label %for.inc
-
-for.inc: ; preds = %if.end, %if.then6
- %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_same_cond_int(int N, bool *cond, int *data0, int *data1) {
-; int t = -1;
-; int s = -1;
-; for (int i = 0; i < N; i++) {
-; if (cond[i]) {
-; t = data0[i];
-; s = data1[i];
-; }
-; }
-; return t | s; // use t and s
-; }
-define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_same_cond_int(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret i32 [[OR]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_same_cond_int(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[OR]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_same_cond_int(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret i32 [[OR]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[S_011:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_1]] = phi i32 [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.inc
- %0 = or i32 %s.1, %t.1
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
- ret i32 %or
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
- %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
- %1 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %1, 0
- br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
- %2 = load i32, ptr %arrayidx2, align 4
- %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
- %3 = load i32, ptr %arrayidx4, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body, %if.then
- %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
- %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_same_cond_float(int N, bool *cond, float *data0, float *data1) {
-; float t = 1.0f;
-; float s = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (cond[i]) {
-; t = data0[i];
-; s = data1[i];
-; }
-; }
-; return t + s; // use t and s
-; }
-define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_same_cond_float(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret float [[ADD]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; EVL-NEXT: [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_same_cond_float(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret float [[ADD]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_same_cond_float(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret float [[ADD]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[S_011:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_010]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_1]] = phi float [ [[TMP3]], [[IF_THEN]] ], [ [[S_011]], [[FOR_BODY]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.inc
- %0 = fadd float %t.1, %s.1
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
- ret float %add
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
- %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
- %1 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %1, 0
- br i1 %tobool.not, label %for.inc, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
- %2 = load float, ptr %arrayidx2, align 4
- %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
- %3 = load float, ptr %arrayidx4, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body, %if.then
- %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
- %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_else_if_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
-; int *data1) {
-; int t = -1;
-; for (int i = 0; i < N; i++) {
-; if (cond0[i])
-; t = data0[i];
-; else if (cond1[i])
-; t = data1[i];
-; }
-; return t; // use t
-; }
-define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_same_scalar_int(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; EVL: if.else:
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; EVL: for.inc.sink.split:
-; EVL-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; NO-EVL: if.else:
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; NO-EVL: for.inc.sink.split:
-; NO-EVL-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_else_if_same_scalar_int(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; DATA: if.else:
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; DATA: for.inc.sink.split:
-; DATA-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_1]] = phi i32 [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.inc, %entry
- %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
- ret i32 %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
- %0 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %0, 0
- br i1 %tobool.not, label %if.else, label %for.inc.sink.split
-
-if.else: ; preds = %for.body
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
- %1 = load i8, ptr %arrayidx4, align 1
- %tobool5.not = icmp eq i8 %1, 0
- br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
-
-for.inc.sink.split: ; preds = %if.else, %for.body
- %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
- %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %indvars.iv
- %2 = load i32, ptr %arrayidx2, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.inc.sink.split, %if.else
- %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_else_if_same_scalar_float(int N, bool *cond0, bool *cond1,
-; float *data0, float *data1) {
-; float t = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (cond0[i])
-; t = data0[i];
-; else if (cond1[i])
-; t = data1[i];
-; }
-; return t; // use t
-; }
-define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_same_scalar_float(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; EVL-NEXT: ret float [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; EVL: if.else:
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; EVL: for.inc.sink.split:
-; EVL-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; NO-EVL: if.else:
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; NO-EVL: for.inc.sink.split:
-; NO-EVL-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_else_if_same_scalar_float(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret float [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[FOR_INC_SINK_SPLIT:%.*]]
-; DATA: if.else:
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[FOR_INC_SINK_SPLIT]]
-; DATA: for.inc.sink.split:
-; DATA-NEXT: [[DATA0_SINK:%.*]] = phi ptr [ [[DATA0:%.*]], [[FOR_BODY]] ], [ [[DATA1:%.*]], [[IF_ELSE]] ]
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0_SINK]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_1]] = phi float [ [[T_016]], [[IF_ELSE]] ], [ [[TMP2]], [[FOR_INC_SINK_SPLIT]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup: ; preds = %for.inc, %entry
- %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
- ret float %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
- %0 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %0, 0
- br i1 %tobool.not, label %if.else, label %for.inc.sink.split
-
-if.else: ; preds = %for.body
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
- %1 = load i8, ptr %arrayidx4, align 1
- %tobool5.not = icmp eq i8 %1, 0
- br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
-
-for.inc.sink.split: ; preds = %if.else, %for.body
- %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
- %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %indvars.iv
- %2 = load float, ptr %arrayidx2, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.inc.sink.split, %if.else
- %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int csa_else_if_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
-; int t = -1;
-; int s = -1;
-; for (int i = 0; i < N; i++) {
-; if (cond0[i])
-; t = data0[i];
-; else if (cond1[i])
-; s = data1[i];
-; }
-; return t | s; // use t and s
-; }
-define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_int(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret i32 [[OR]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: if.else:
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL: if.then6:
-; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; EVL-NEXT: [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_else_if_int(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret i32 [[OR]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: if.else:
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL: if.then6:
-; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; NO-EVL-NEXT: [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_else_if_int(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi i32 [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[TMP0:%.*]] = or i32 [[S_1_LCSSA]], [[T_1_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret i32 [[OR]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[S_017:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_016:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: if.else:
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA: if.then6:
-; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_1]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; DATA-NEXT: [[S_1]] = phi i32 [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.inc
- %0 = or i32 %s.1, %t.1
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
- ret i32 %or
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
- %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
- %1 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %1, 0
- br i1 %tobool.not, label %if.else, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
- %2 = load i32, ptr %arrayidx2, align 4
- br label %for.inc
-
-if.else: ; preds = %for.body
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
- %3 = load i8, ptr %arrayidx4, align 1
- %tobool5.not = icmp eq i8 %3, 0
- br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6: ; preds = %if.else
- %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
- %4 = load i32, ptr %arrayidx8, align 4
- br label %for.inc
-
-for.inc: ; preds = %if.then, %if.then6, %if.else
- %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
- %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; float csa_else_if_float(int N, bool *cond0, bool *cond1, float *data0,
-; float *data1) {
-; float t = 1.0f;
-; float s = 1.0f;
-; for (int i = 0; i < N; i++) {
-; if (cond0[i])
-; t = data0[i];
-; else if (cond1[i])
-; s = data1[i];
-; }
-; return t + s; // use t and s
-; }
-define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
-; EVL-LABEL: @csa_else_if_float(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; EVL-NEXT: ret float [[ADD]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; EVL-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; EVL: if.then:
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: if.else:
-; EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; EVL: if.then6:
-; EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; EVL-NEXT: br label [[FOR_INC]]
-; EVL: for.inc:
-; EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; EVL-NEXT: [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; NO-EVL-LABEL: @csa_else_if_float(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; NO-EVL-NEXT: ret float [[ADD]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; NO-EVL-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; NO-EVL: if.then:
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: if.else:
-; NO-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; NO-EVL-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; NO-EVL-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; NO-EVL: if.then6:
-; NO-EVL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; NO-EVL-NEXT: br label [[FOR_INC]]
-; NO-EVL: for.inc:
-; NO-EVL-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; NO-EVL-NEXT: [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @csa_else_if_float(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_INC:%.*]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[TMP0:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP0]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret float [[ADD]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
-; DATA-NEXT: [[S_017:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[S_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[T_016:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[T_1]], [[FOR_INC]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[COND0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; DATA-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP1]], 0
-; DATA-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
-; DATA: if.then:
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: if.else:
-; DATA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[COND1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
-; DATA-NEXT: [[TOBOOL5_NOT:%.*]] = icmp eq i8 [[TMP3]], 0
-; DATA-NEXT: br i1 [[TOBOOL5_NOT]], label [[FOR_INC]], label [[IF_THEN6:%.*]]
-; DATA: if.then6:
-; DATA-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; DATA-NEXT: br label [[FOR_INC]]
-; DATA: for.inc:
-; DATA-NEXT: [[T_1]] = phi float [ [[TMP2]], [[IF_THEN]] ], [ [[T_016]], [[IF_THEN6]] ], [ [[T_016]], [[IF_ELSE]] ]
-; DATA-NEXT: [[S_1]] = phi float [ [[S_017]], [[IF_THEN]] ], [ [[TMP4]], [[IF_THEN6]] ], [ [[S_017]], [[IF_ELSE]] ]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-entry:
- %cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.inc
- %0 = fadd float %t.1, %s.1
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
- ret float %add
-
-for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
- %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
- %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
- %1 = load i8, ptr %arrayidx, align 1
- %tobool.not = icmp eq i8 %1, 0
- br i1 %tobool.not, label %if.else, label %if.then
-
-if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
- %2 = load float, ptr %arrayidx2, align 4
- br label %for.inc
-
-if.else: ; preds = %for.body
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
- %3 = load i8, ptr %arrayidx4, align 1
- %tobool5.not = icmp eq i8 %3, 0
- br i1 %tobool5.not, label %for.inc, label %if.then6
-
-if.then6: ; preds = %if.else
- %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
- %4 = load float, ptr %arrayidx8, align 4
- br label %for.inc
-
-for.inc: ; preds = %if.then, %if.then6, %if.else
- %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
- %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
-; uint64_t idx = ii;
-; for (uint64_t i = 0; i < n; ++i)
-; idx = (a[i] > b[i]) ? i : idx;
-; return idx;
-; }
-define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; EVL-LABEL: @idx_scalar(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; EVL-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; EVL-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
-; EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-; NO-EVL-LABEL: @idx_scalar(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; NO-EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; NO-EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; NO-EVL-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; NO-EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; NO-EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
-; NO-EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; NO-EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-; DATA-LABEL: @idx_scalar(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; DATA-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; DATA-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; DATA-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; DATA-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; DATA-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; DATA-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; DATA-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; DATA-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; DATA-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; DATA-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
-; DATA-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; DATA-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-entry:
- %cmp8.not = icmp eq i64 %n, 0
- br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %cond.lcssa = phi i64 [ %cond, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
- ret i64 %idx.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
- %0 = load i64, ptr %arrayidx, align 8
- %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
- %1 = load i64, ptr %arrayidx1, align 8
- %cmp2 = icmp sgt i64 %0, %1
- %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
- %inc = add nuw i64 %i.010, 1
- %exitcond.not = icmp eq i64 %inc, %n
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
-; uint64_t idx = ii;
-; for (uint64_t i = n; i > 0; --i) // decreasing
-; idx = (a[i - 1] > b[i - 1]) ? i : idx;
-; return idx;
-; }
-define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
-; EVL-LABEL: @idx_scalar_dec(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; EVL-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; EVL-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; EVL-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; EVL-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; EVL-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; EVL-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; EVL-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; EVL-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; EVL-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; EVL-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; EVL-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; EVL-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-;
-; NO-EVL-LABEL: @idx_scalar_dec(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; NO-EVL-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; NO-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; NO-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; NO-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; NO-EVL-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; NO-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; NO-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; NO-EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; NO-EVL-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; NO-EVL-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; NO-EVL-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; NO-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; NO-EVL-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; NO-EVL-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; NO-EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; NO-EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; NO-EVL-NEXT: [[SUB]] = add i64 [[I_011]], -1
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; NO-EVL-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; NO-EVL-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; NO-EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; NO-EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-;
-; DATA-LABEL: @idx_scalar_dec(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; DATA-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; DATA-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; DATA-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; DATA-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; DATA-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; DATA-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; DATA-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; DATA-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; DATA-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; DATA-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; DATA-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; DATA-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; DATA-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; DATA-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; DATA-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; DATA-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; DATA-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; DATA-NEXT: [[SUB]] = add i64 [[I_011]], -1
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; DATA-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; DATA-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; DATA-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; DATA-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; DATA-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; DATA-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-;
-entry:
- %cmp.not9 = icmp eq i64 %n, 0
- br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %cond.lcssa = phi i64 [ %cond, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
- ret i64 %idx.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
- %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
- %sub = add i64 %i.011, -1
- %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
- %0 = load i64, ptr %arrayidx, align 8
- %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
- %1 = load i64, ptr %arrayidx2, align 8
- %cmp3 = icmp sgt i64 %0, %1
- %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
- %cmp.not = icmp eq i64 %sub, 0
- br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the C/C++ program below. The key part is
-; that the true arm of the select keeps the value carried over from the
-; previous iteration (initially 0), instead of selecting the newly loaded value.
-; int simple_csa_int_select_neg_cond(int N, int *data) {
-;   int t = 0;
-;   for (int i = 0; i < N; i++) {
-;     if (i != data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_int_select_neg_cond(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; EVL-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; EVL-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; NO-EVL-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; NO-EVL-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; NO-EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; NO-EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select_neg_cond(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 4 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 4 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
-; DATA-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
-; DATA-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP17]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; DATA-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[CSA_MASK_SEL]], <vscale x 4 x i32> [[CSA_STEP]], <vscale x 4 x i32> zeroinitializer
-; DATA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; DATA-NEXT: [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; DATA-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; DATA-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; DATA-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT]], i32 0
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP29:%.*]] = zext i32 [[TMP28]] to i64
-; DATA-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP29]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP28]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-;
-entry:
- %cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
- ret i32 %t.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %1 = zext i32 %0 to i64
- %cmp1.not = icmp eq i64 %indvars.iv, %1
- %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; This function is generated from the following C/C++ program:
-; int *simple_csa_ptr_select(int N, int **data) {
-;   int *t = nullptr;
-;   for (int i = 0; i < N; i++) {
-;     if (i < *data[i])
-;       t = data[i];
-;   }
-;   return t; // use t
-; }
-define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
-; EVL-LABEL: @simple_csa_ptr_select(
-; EVL-NEXT: entry:
-; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; EVL: for.body.preheader:
-; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; EVL: vector.ph:
-; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; EVL: vector.body:
-; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
-; EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; EVL: middle.block:
-; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
-; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; EVL: scalar.ph:
-; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; EVL-NEXT: ret ptr [[T_0_LCSSA]]
-; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; EVL-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-;
-; NO-EVL-LABEL: @simple_csa_ptr_select(
-; NO-EVL-NEXT: entry:
-; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; NO-EVL: for.body.preheader:
-; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; NO-EVL: vector.ph:
-; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; NO-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; NO-EVL-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; NO-EVL: vector.body:
-; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
-; NO-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; NO-EVL: middle.block:
-; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; NO-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; NO-EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; NO-EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
-; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NO-EVL: scalar.ph:
-; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; NO-EVL-NEXT: ret ptr [[T_0_LCSSA]]
-; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; NO-EVL-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_ptr_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
-; DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; DATA-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; DATA-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; DATA-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; DATA-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; DATA-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; DATA-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; DATA-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret ptr [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; DATA-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-;
-entry:
- %cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- %wide.trip.count = zext i32 %N to i64
- br label %for.body
-
-for.cond.cleanup.loopexit:
- %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup:
- %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
- ret ptr %t.0.lcssa
-
-for.body:
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
- %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
- %0 = load ptr, ptr %arrayidx, align 8
- %1 = load i32, ptr %0, align 4
- %2 = sext i32 %1 to i64
- %cmp1 = icmp slt i64 %indvars.iv, %2
- %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-}
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
new file mode 100644
index 00000000000000..0269f0c672a3fa
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
@@ -0,0 +1,3091 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN: -enable-csa-vectorization -scalable-vectorization=on \
+; RUN: -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
+; RUN: | FileCheck %s -check-prefix=EVL
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=none \
+; RUN: -enable-csa-vectorization -scalable-vectorization=on \
+; RUN: -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
+; RUN: | FileCheck %s -check-prefix=NO-EVL
+; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data \
+; RUN: -enable-csa-vectorization -scalable-vectorization=on \
+; RUN: -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
+; RUN: | FileCheck %s -check-prefix=DATA
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select(int N, int *data, int64_t a) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (a < data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
+; EVL-LABEL: @simple_csa_int_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
+; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; EVL-NEXT: [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; EVL-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
+; EVL-NEXT: [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
+; EVL-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; EVL-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
+; EVL-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
+; EVL-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; NO-EVL-NEXT: [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; NO-EVL-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
+; NO-EVL-NEXT: [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
+; NO-EVL-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; NO-EVL-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
+; NO-EVL-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_int_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
+; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; DATA-NEXT: [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; DATA-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
+; DATA-NEXT: [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
+; DATA-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; DATA-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
+; DATA-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
+; DATA-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0 ; loop guard, the loop is entered only when N > 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; trip count widened to i64 to match the induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ] ; value of the conditional assignment on loop exit
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ] ; -1 when the loop was skipped, otherwise the loop result
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; i64 induction variable starting at 0
+ %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ] ; running result, initialised to -1
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4 ; element of %data at the current index
+ %1 = sext i32 %0 to i64 ; sign-extend the element so it can be compared with the i64 %a
+ %cmp1 = icmp slt i64 %a, %1 ; signed test: %a < element
+ %spec.select = select i1 %cmp1, i32 %0, i32 %t.010 ; take the element when the test holds, else keep the previous result
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ; leave the loop once the next index equals the trip count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int_select_induction_cmp(int N, int *data) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (i < data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; EVL-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; EVL-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; NO-EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; NO-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; NO-EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; NO-EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; NO-EVL-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; NO-EVL-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; NO-EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; NO-EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_int_select_induction_cmp(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; DATA-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; DATA-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; DATA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; DATA-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; DATA-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; DATA-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; DATA-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; DATA-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; DATA-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; DATA-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %1 = sext i32 %0 to i64
+ %cmp1 = icmp slt i64 %indvars.iv, %1
+ %spec.select = select i1 %cmp1, i32 %0, i32 %t.010
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float_select(int N, float *data) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (0.0f < data[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+;
+; The loop body is a load, an `fcmp ogt` against 0.0 and a `select` that either
+; takes the loaded value or keeps the previous value of t.
+; The check blocks for the three prefixes below expect the same output: a
+; vector.body carrying a mask phi (csa.mask.phi) and a data phi (csa.data.phi),
+; and a middle.block that extracts one lane of the data vector or falls back to
+; the initial value 1.0.
+define float @simple_csa_float_select(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_float_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
+; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; EVL-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; EVL-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
+; EVL-NEXT: [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
+; EVL-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; EVL-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
+; EVL-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
+; EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: ret float [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
+; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_float_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; NO-EVL-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
+; NO-EVL-NEXT: [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
+; NO-EVL-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
+; NO-EVL-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
+; NO-EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
+; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_float_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
+; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; DATA-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; DATA-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
+; DATA-NEXT: [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
+; DATA-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; DATA-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
+; DATA-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
+; DATA-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT: ret float [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
+; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+entry:
+ ; Skip the loop entirely when N <= 0.
+ %cmp8 = icmp sgt i32 %N, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ; Result is 1.0 when the loop was skipped, otherwise the last %t.1.
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ ; %t.09 carries the value of t from the previous iteration (1.0 on entry).
+ %t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ ; Condition of the assignment: the loaded element is greater than 0.0.
+ %cmp1 = fcmp ogt float %0, 0.000000e+00
+ ; Take the loaded element when %cmp1 holds, otherwise keep the previous value.
+ %t.1 = select i1 %cmp1, float %0, float %t.09
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int simple_csa_int(int N, bool *cond, int *data) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+;
+; Here the new value of t comes from a load that only executes in %if.then; the
+; old and new values are merged by a phi in %for.inc rather than by a select.
+; For each of the three check prefixes the only expectation is that no
+; vector.body block appears after the function label.
+define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
+; EVL-LABEL: @simple_csa_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @simple_csa_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @simple_csa_int(
+; DATA-NOT: vector.body:
+;
+entry:
+ ; Skip the loop entirely when N <= 0.
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ ; Result is -1 when the loop was skipped, otherwise the last %t.1.
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ ; %t.07 carries the value of t from the previous iteration (-1 on entry).
+ %t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+ ; Load cond[i] as a byte and branch past %if.then when it is zero.
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ ; Reached only when cond[i] is nonzero: load data[i].
+ %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ ; %t.1 is data[i] when arriving from %if.then, otherwise the previous %t.07.
+ %t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float simple_csa_float(int N, bool *cond, float *data) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond[i])
+; t = data[i];
+; }
+; return t; // use t
+; }
+define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
+; EVL-LABEL: @simple_csa_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @simple_csa_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @simple_csa_float(
+; DATA-NOT: vector.body:
+;
+entry: ; every check prefix above expects no vector.body block for this function
+ %cmp6 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; trip count widened to the i64 induction variable type
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ] ; 1.0 if the loop never ran, else the last value of t
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried across iterations, initially 1.0
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1 ; cond[i]
+ %tobool.not = icmp eq i8 %0, 0 ; true when cond[i] is zero
+ br i1 %tobool.not, label %for.inc, label %if.then ; data[i] is loaded only when cond[i] is nonzero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx2, align 4 ; data[i]
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ] ; conditional assignment expressed as a phi, not a select
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select(int N, int *data0, int *data1, int a) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (a < data0[i])
+; t = data0[i];
+; if (a < data1[i])
+; s = data1[i];
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
+; EVL-LABEL: @csa_in_series_int_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; EVL-NEXT: [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; EVL-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
+; EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; EVL-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; EVL-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
+; EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
+; EVL-NEXT: [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
+; EVL-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+; EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
+; EVL-NEXT: [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
+; EVL-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret i32 [[OR]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; NO-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; NO-EVL-NEXT: [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; NO-EVL-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; NO-EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
+; NO-EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; NO-EVL-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; NO-EVL-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; NO-EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
+; NO-EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
+; NO-EVL-NEXT: [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
+; NO-EVL-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+; NO-EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
+; NO-EVL-NEXT: [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
+; NO-EVL-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[OR]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+; DATA-LABEL: @csa_in_series_int_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
+; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
+; DATA-NEXT: [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
+; DATA-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; DATA-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; DATA-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
+; DATA-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; DATA-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; DATA-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; DATA-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
+; DATA-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; DATA-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
+; DATA-NEXT: [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
+; DATA-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
+; DATA-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
+; DATA-NEXT: [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
+; DATA-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret i32 [[OR]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
+; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+;
+entry: ; scalar input loop with two independent conditional assignments (t and s)
+ %cmp21 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; trip count widened to the i64 induction variable type
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = or i32 %s.1, %spec.select ; t | s, combined once after the loop exits
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ] ; -1 when the loop never runs
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ] ; s carried across iterations, initially -1
+ %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ] ; t carried across iterations, initially -1
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx, align 4 ; data0[i]
+ %2 = sext i32 %1 to i64 ; widened so it can be compared against the i64 argument %a
+ %cmp1 = icmp slt i64 %a, %2
+ %spec.select = select i1 %cmp1, i32 %1, i32 %t.022 ; t = (a < data0[i]) ? data0[i] : t
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %3 = load i32, ptr %arrayidx5, align 4 ; data1[i]
+ %4 = sext i32 %3 to i64 ; widened so it can be compared against the i64 argument %a
+ %cmp6 = icmp slt i64 %a, %4
+ %s.1 = select i1 %cmp6, i32 %3, i32 %s.023 ; s = (a < data1[i]) ? data1[i] : s
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int_select_induction_cmp(int N, int *data0, int *data1) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (i < data0[i])
+; t = data0[i];
+; if (i < data1[i])
+; s = data1[i];
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
+; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
+; EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
+; EVL-NEXT: [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
+; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
+; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
+; EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
+; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
+; EVL-NEXT: [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
+; EVL-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+; EVL-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
+; EVL-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
+; EVL-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret i32 [[OR]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; NO-EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; NO-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
+; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; NO-EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; NO-EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
+; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
+; NO-EVL-NEXT: [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
+; NO-EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; NO-EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
+; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
+; NO-EVL-NEXT: [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
+; NO-EVL-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+; NO-EVL-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
+; NO-EVL-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
+; NO-EVL-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret i32 [[OR]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; DATA-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; DATA-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; DATA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
+; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; DATA-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; DATA-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
+; DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
+; DATA-NEXT: [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; DATA-NEXT: [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
+; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
+; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
+; DATA-NEXT: [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
+; DATA-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
+; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
+; DATA-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
+; DATA-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
+; DATA-NEXT: [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
+; DATA-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
+; DATA-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
+; DATA-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
+; DATA-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret i32 [[OR]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
+; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+;
+entry:
+ %cmp21 = icmp sgt i32 %N, 0
+ br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = or i32 %s.1, %spec.select
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
+ %t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx, align 4
+ %2 = sext i32 %1 to i64
+ %cmp1 = icmp slt i64 %indvars.iv, %2
+ %spec.select = select i1 %cmp1, i32 %1, i32 %t.022
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %3 = load i32, ptr %arrayidx5, align 4
+ %4 = sext i32 %3 to i64
+ %cmp6 = icmp slt i64 %indvars.iv, %4
+ %s.1 = select i1 %cmp6, i32 %3, i32 %s.023
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float_select(int N, float *data0,
+; float *data1) {
+; float t = 1.0f;
+; float s = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (0.0f < data0[i])
+; t = data0[i];
+; if (0.0f < data1[i])
+; s = data1[i];
+; }
+; return t + s; // use t and s
+; }
+define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_float_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; EVL-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; EVL-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
+; EVL-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
+; EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
+; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
+; EVL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; EVL-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
+; EVL-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
+; EVL-NEXT: [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; EVL-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
+; EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT: ret float [[ADD]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
+; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+; NO-EVL-LABEL: @csa_in_series_float_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; NO-EVL-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; NO-EVL-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
+; NO-EVL-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
+; NO-EVL-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
+; NO-EVL-NEXT: [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; NO-EVL-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; NO-EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
+; NO-EVL-NEXT: [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; NO-EVL-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
+; NO-EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; NO-EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: ret float [[ADD]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
+; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+; DATA-LABEL: @csa_in_series_float_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
+; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
+; DATA-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
+; DATA-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
+; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
+; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
+; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
+; DATA-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
+; DATA-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
+; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
+; DATA-NEXT: [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; DATA-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
+; DATA-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
+; DATA-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
+; DATA-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
+; DATA-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
+; DATA-NEXT: [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
+; DATA-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
+; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
+; DATA-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
+; DATA-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; DATA-NEXT: ret float [[ADD]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
+; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
+; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; DATA-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
+; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+;
+entry:
+ %cmp19 = icmp sgt i32 %N, 0
+ br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = fadd float %t.1, %s.1
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+ ret float %add
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
+ %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
+ %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx, align 4
+ %cmp1 = fcmp ogt float %1, 0.000000e+00
+ %t.1 = select i1 %cmp1, float %1, float %t.020
+ %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx5, align 4
+ %cmp6 = fcmp ogt float %2, 0.000000e+00
+ %s.1 = select i1 %cmp6, float %2, float %s.021
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; if (cond1[i])
+; s = data1[i];
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_int(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = or i32 %s.1, %t.1 ; t | s, using the values from the last iteration
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ] ; -1 when the loop was skipped
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; i
+ %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried into this iteration
+ %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1 ; cond0[i]
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %if.end, label %if.then ; skip the t update when cond0[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2, align 4 ; data0[i], loaded only on this path
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ] ; new t, or the previous t if not updated
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %3 = load i8, ptr %arrayidx4, align 1 ; cond1[i]
+ %tobool5.not = icmp eq i8 %3, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6 ; skip the s update when cond1[i] is zero
+
+if.then6: ; preds = %if.end
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %4 = load i32, ptr %arrayidx8, align 4 ; data1[i], loaded only on this path
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then6
+ %s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ] ; new s, or the previous s if not updated
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_float(int N, bool *cond0, bool *cond1, float *data0,
+; float *data1) {
+; float t = 1.0f;
+; float s = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; if (cond1[i])
+; s = data1[i];
+; }
+; return t + s; // use t and s
+; }
+define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_float(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = fadd float %t.1, %s.1 ; t + s, using the values from the last iteration
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ] ; 1.0f + 1.0f when the loop was skipped
+ ret float %add
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; i
+ %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried into this iteration
+ %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1 ; cond0[i]
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %if.end, label %if.then ; skip the t update when cond0[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx2, align 4 ; data0[i], loaded only on this path
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ] ; new t, or the previous t if not updated
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %3 = load i8, ptr %arrayidx4, align 1 ; cond1[i]
+ %tobool5.not = icmp eq i8 %3, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6 ; skip the s update when cond1[i] is zero
+
+if.then6: ; preds = %if.end
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %4 = load float, ptr %arrayidx8, align 4 ; data1[i], loaded only on this path
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then6
+ %s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ] ; new s, or the previous s if not updated
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int_select(int N, int *data0,
+; int *data1) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (i < data0[i])
+; t = data0[i];
+; if (i < data1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_int_select(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_same_scalar_int_select(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp21 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ] ; -1 when the loop was skipped, else the final t
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; i
+ %t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4 ; data0[i]
+ %1 = sext i32 %0 to i64
+ %cmp1 = icmp slt i64 %indvars.iv, %1 ; i < data0[i]
+ %spec.select = select i1 %cmp1, i32 %0, i32 %t.022 ; first conditional update of t
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx5, align 4 ; data1[i]
+ %3 = sext i32 %2 to i64
+ %cmp6 = icmp slt i64 %indvars.iv, %3 ; i < data1[i]
+ %t.2 = select i1 %cmp6, i32 %2, i32 %spec.select ; second update, falling back to the first select's result
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float_select(int N,
+; float *data0, float *data1) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (0.0f < data0[i])
+; t = data0[i];
+; if (0.0f < data1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_float_select(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_same_scalar_float_select(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp19 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ] ; 1.0f when the loop was skipped, else the final t
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ; i
+ %t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4 ; data0[i]
+ %cmp1 = fcmp ogt float %0, 0.000000e+00 ; 0.0f < data0[i]
+ %t.1 = select i1 %cmp1, float %0, float %t.020 ; first conditional update of t
+ %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx5, align 4 ; data1[i]
+ %cmp6 = fcmp ogt float %1, 0.000000e+00 ; 0.0f < data1[i]
+ %t.2 = select i1 %cmp6, float %1, float %t.1 ; second update, falling back to the first select's result
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_in_series_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+; int *data1) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; if (cond1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_same_scalar_int(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ] ; -1 when the loop was skipped, else the final t
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; i
+ %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1 ; cond0[i]
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %if.end, label %if.then ; skip the first update when cond0[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2, align 4 ; data0[i], loaded only on this path
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ] ; t after the first conditional update
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx4, align 1 ; cond1[i]
+ %tobool5.not = icmp eq i8 %2, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6 ; skip the second update when cond1[i] is zero
+
+if.then6: ; preds = %if.end
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %3 = load i32, ptr %arrayidx8, align 4 ; data1[i], loaded only on this path
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then6
+ %t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ] ; t after the second update, chained on %t.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_in_series_same_scalar_float(int N, bool *cond0, bool *cond1,
+; float *data0, float *data1) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; if (cond1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_in_series_same_scalar_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_in_series_same_scalar_float(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ] ; 1.0f when the loop was skipped, else the final t
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; i
+ %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1 ; cond0[i]
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %if.end, label %if.then ; skip the first update when cond0[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %1 = load float, ptr %arrayidx2, align 4 ; data0[i], loaded only on this path
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ] ; t after the first conditional update
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx4, align 1 ; cond1[i]
+ %tobool5.not = icmp eq i8 %2, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6 ; skip the second update when cond1[i] is zero
+
+if.then6: ; preds = %if.end
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %3 = load float, ptr %arrayidx8, align 4 ; data1[i], loaded only on this path
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then6
+ %t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ] ; t after the second update, chained on %t.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_same_cond_int(int N, bool *cond, int *data0, int *data1) {
+; int t = -1;
+; int s = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond[i]) {
+; t = data0[i];
+; s = data1[i];
+; }
+; }
+; return t | s; // use t and s
+; }
+define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_same_cond_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_same_cond_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_same_cond_int(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp9 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = or i32 %s.1, %t.1 ; t | s, using the values from the last iteration
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ] ; -1 when the loop was skipped
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; i
+ %s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried into this iteration
+ %t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1 ; cond[i]
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %for.inc, label %if.then ; skip both updates when cond[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2, align 4 ; data0[i]
+ %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %3 = load i32, ptr %arrayidx4, align 4 ; data1[i]
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ] ; new t, or the previous t if not updated
+ %s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ] ; new s, or the previous s if not updated
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_same_cond_float(int N, bool *cond, float *data0, float *data1) {
+; float t = 1.0f;
+; float s = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond[i]) {
+; t = data0[i];
+; s = data1[i];
+; }
+; }
+; return t + s; // use t and s
+; }
+define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_same_cond_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_same_cond_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_same_cond_float(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp9 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = fadd float %t.1, %s.1 ; t + s, using the values from the last iteration
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ] ; 1.0f + 1.0f when the loop was skipped
+ ret float %add
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; i
+ %s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ] ; s carried into this iteration
+ %t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1 ; cond[i]
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %for.inc, label %if.then ; skip both updates when cond[i] is zero
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx2, align 4 ; data0[i]
+ %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %3 = load float, ptr %arrayidx4, align 4 ; data1[i]
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ] ; new t, or the previous t if not updated
+ %s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ] ; new s, or the previous s if not updated
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_same_scalar_int(int N, bool *cond0, bool *cond1, int *data0,
+; int *data1) {
+; int t = -1;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; else if (cond1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_same_scalar_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_else_if_same_scalar_int(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ] ; -1 when the loop was skipped, else the final t
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; i
+ %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1 ; cond0[i]
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %if.else, label %for.inc.sink.split ; cond1 is examined only when cond0[i] is zero
+
+if.else: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx4, align 1 ; cond1[i]
+ %tobool5.not = icmp eq i8 %1, 0
+ br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split ; neither condition set: t is left unchanged
+
+for.inc.sink.split: ; preds = %if.else, %for.body
+ %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ] ; data0 when cond0[i] was set, data1 when only cond1[i] was
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2, align 4 ; one load shared by both update paths
+ br label %for.inc
+
+for.inc: ; preds = %for.inc.sink.split, %if.else
+ %t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ] ; previous t, or the newly loaded value
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_same_scalar_float(int N, bool *cond0, bool *cond1,
+; float *data0, float *data1) {
+; float t = 1.0f;
+; for (int i = 0; i < N; i++) {
+; if (cond0[i])
+; t = data0[i];
+; else if (cond1[i])
+; t = data1[i];
+; }
+; return t; // use t
+; }
+define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_same_scalar_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_else_if_same_scalar_float(
+; DATA-NOT: vector.body:
+;
+entry: ; the checks above expect no vector.body block in any RUN configuration
+ %cmp15 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64 ; N widened to match the i64 induction variable
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc, %entry
+ %t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ] ; 1.0f when the loop was skipped, else the final t
+ ret float %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ; i
+ %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ] ; t carried into this iteration
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %0 = load i8, ptr %arrayidx, align 1 ; cond0[i]
+ %tobool.not = icmp eq i8 %0, 0
+ br i1 %tobool.not, label %if.else, label %for.inc.sink.split ; cond1 is examined only when cond0[i] is zero
+
+if.else: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx4, align 1 ; cond1[i]
+ %tobool5.not = icmp eq i8 %1, 0
+ br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split ; neither condition set: t is left unchanged
+
+for.inc.sink.split: ; preds = %if.else, %for.body
+ %data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ] ; data0 when cond0[i] was set, data1 when only cond1[i] was
+ %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx2, align 4 ; one load shared by both update paths
+ br label %for.inc
+
+for.inc: ; preds = %for.inc.sink.split, %if.else
+ %t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ] ; previous t, or the newly loaded value
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int csa_else_if_int(int N, bool *cond0, bool *cond1, int *data0, int *data1) {
+;   int t = -1;
+;   int s = -1;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t | s; // use t and s
+; }
+define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_int(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_else_if_int(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_else_if_int(
+; DATA-NOT: vector.body:
+;
+entry:
+ %cmp15 = icmp sgt i32 %N, 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = or i32 %s.1, %t.1
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+ ret i32 %or
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
+ %t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2, align 4
+ br label %for.inc
+
+if.else: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %3 = load i8, ptr %arrayidx4, align 1
+ %tobool5.not = icmp eq i8 %3, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6: ; preds = %if.else
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %4 = load i32, ptr %arrayidx8, align 4
+ br label %for.inc
+
+for.inc: ; preds = %if.then, %if.then6, %if.else
+ %t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+ %s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; float csa_else_if_float(int N, bool *cond0, bool *cond1, float *data0,
+;                         float *data1) {
+;   float t = 1.0f;
+;   float s = 1.0f;
+;   for (int i = 0; i < N; i++) {
+;     if (cond0[i])
+;       t = data0[i];
+;     else if (cond1[i])
+;       s = data1[i];
+;   }
+;   return t + s; // use t and s
+; }
+define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %data1) {
+; EVL-LABEL: @csa_else_if_float(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @csa_else_if_float(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @csa_else_if_float(
+; DATA-NOT: vector.body:
+;
+entry:
+ %cmp15 = icmp sgt i32 %N, 0
+ br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.inc
+ %0 = fadd float %t.1, %s.1
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+ ret float %add
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
+ %t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1
+ %tobool.not = icmp eq i8 %1, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %2 = load float, ptr %arrayidx2, align 4
+ br label %for.inc
+
+if.else: ; preds = %for.body
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %3 = load i8, ptr %arrayidx4, align 1
+ %tobool5.not = icmp eq i8 %3, 0
+ br i1 %tobool5.not, label %for.inc, label %if.then6
+
+if.then6: ; preds = %if.else
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %4 = load float, ptr %arrayidx8, align 4
+ br label %for.inc
+
+for.inc: ; preds = %if.then, %if.then6, %if.else
+ %t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
+ %s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+;   uint64_t idx = ii;
+;   for (uint64_t i = 0; i < n; ++i)
+;     idx = (a[i] > b[i]) ? i : idx;
+;   return idx;
+; }
+define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @idx_scalar(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @idx_scalar(
+; DATA-NOT: vector.body:
+;
+entry:
+ %cmp8.not = icmp eq i64 %n, 0
+ br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %cond.lcssa = phi i64 [ %cond, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+ ret i64 %idx.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %idx.09 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %i.010
+ %0 = load i64, ptr %arrayidx, align 8
+ %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 %i.010
+ %1 = load i64, ptr %arrayidx1, align 8
+ %cmp2 = icmp sgt i64 %0, %1
+ %cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
+ %inc = add nuw i64 %i.010, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; uint64_t idx_scalar_dec(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
+;   uint64_t idx = ii;
+;   for (uint64_t i = n; i > 0; --i) // decreasing
+;     idx = (a[i - 1] > b[i - 1]) ? i : idx;
+;   return idx;
+; }
+define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
+; EVL-LABEL: @idx_scalar_dec(
+; EVL-NOT: vector.body:
+;
+; NO-EVL-LABEL: @idx_scalar_dec(
+; NO-EVL-NOT: vector.body:
+;
+; DATA-LABEL: @idx_scalar_dec(
+; DATA-NOT: vector.body:
+;
+entry:
+ %cmp.not9 = icmp eq i64 %n, 0
+ br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %cond.lcssa = phi i64 [ %cond, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+ ret i64 %idx.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.011 = phi i64 [ %sub, %for.body ], [ %n, %for.body.preheader ]
+ %idx.010 = phi i64 [ %cond, %for.body ], [ %ii, %for.body.preheader ]
+ %sub = add i64 %i.011, -1
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %sub
+ %0 = load i64, ptr %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds i64, ptr %b, i64 %sub
+ %1 = load i64, ptr %arrayidx2, align 8
+ %cmp3 = icmp sgt i64 %0, %1
+ %cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
+ %cmp.not = icmp eq i64 %sub, 0
+ br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the C/C++ program below.
+; The key part of this function is that the true arm of the select keeps the
+; previous value of t, instead of selecting the newly loaded value.
+; int simple_csa_int_select_neg_cond(int N, int *data) {
+;   int t = 0;
+;   for (int i = 0; i < N; i++) {
+;     if (i != data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
+; EVL-LABEL: @simple_csa_int_select_neg_cond(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; EVL: vector.ph:
+; EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; EVL: vector.body:
+; EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; EVL-NEXT: [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; EVL-NEXT: [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; EVL: middle.block:
+; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; EVL-NEXT: [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
+; EVL-NEXT: [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; EVL-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
+; EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL: scalar.ph:
+; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
+; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-EVL: vector.ph:
+; NO-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; NO-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; NO-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; NO-EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; NO-EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; NO-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; NO-EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-EVL: vector.body:
+; NO-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; NO-EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; NO-EVL-NEXT: [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; NO-EVL-NEXT: [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; NO-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-EVL: middle.block:
+; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; NO-EVL-NEXT: [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
+; NO-EVL-NEXT: [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; NO-EVL-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; NO-EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
+; NO-EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL: scalar.ph:
+; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
+; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+; DATA-LABEL: @simple_csa_int_select_neg_cond(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
+; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DATA: vector.ph:
+; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; DATA-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
+; DATA-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
+; DATA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
+; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
+; DATA: vector.body:
+; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
+; DATA-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
+; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
+; DATA-NEXT: [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; DATA-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
+; DATA-NEXT: [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; DATA-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
+; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; DATA-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; DATA: middle.block:
+; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
+; DATA-NEXT: [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; DATA-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
+; DATA-NEXT: [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; DATA-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
+; DATA-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
+; DATA-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
+; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
+; DATA-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
+; DATA-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA: scalar.ph:
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; DATA-NEXT: ret i32 [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DATA-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
+; DATA-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %t.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %1 = zext i32 %0 to i64
+ %cmp1.not = icmp eq i64 %indvars.iv, %1
+ %spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; This function is generated from the following C/C++ program:
+; int *simple_csa_ptr_select(int N, int **data, int a) {
+;   int *t = nullptr;
+;   for (int i = 0; i < N; i++) {
+;     if (a < *data[i])
+;       t = data[i];
+;   }
+;   return t; // use t
+; }
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
+; EVL-LABEL: @simple_csa_ptr_select(
+; EVL-NEXT: entry:
+; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL: for.body.preheader:
+; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; EVL-NEXT: br label [[FOR_BODY:%.*]]
+; EVL: for.cond.cleanup.loopexit:
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; EVL: for.cond.cleanup:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: ret ptr [[T_0_LCSSA]]
+; EVL: for.body:
+; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; NO-EVL-LABEL: @simple_csa_ptr_select(
+; NO-EVL-NEXT: entry:
+; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL: for.body.preheader:
+; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; NO-EVL: for.cond.cleanup:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: ret ptr [[T_0_LCSSA]]
+; NO-EVL: for.body:
+; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; DATA-LABEL: @simple_csa_ptr_select(
+; DATA-NEXT: entry:
+; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DATA: for.body.preheader:
+; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; DATA-NEXT: br label [[FOR_BODY:%.*]]
+; DATA: for.cond.cleanup.loopexit:
+; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
+; DATA: for.cond.cleanup:
+; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; DATA-NEXT: ret ptr [[T_0_LCSSA]]
+; DATA: for.body:
+; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DATA-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
+; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; DATA-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
+; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+ ret ptr %t.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+ %0 = load ptr, ptr %arrayidx, align 8
+ %1 = load i32, ptr %0, align 4
+ %2 = sext i32 %1 to i64
+ %cmp1 = icmp slt i64 %a, %2
+ %spec.select = select i1 %cmp1, ptr %0, ptr %t.010
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
>From 3cc22ce350abe45354ac0794d195f9d0ead259fa Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 30 Aug 2024 11:01:18 -0700
Subject: [PATCH 08/16] fixup! clang-format
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 -
llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 2 +-
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6d75a7273c69a4..9f50855f4e7807 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3540,4 +3540,3 @@ void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
printOperands(O, SlotTracker);
}
#endif
-
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index baafa70b9effbf..2b9c32624a2916 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -15,8 +15,8 @@
#include "VPlanVerifier.h"
#include "VPlan.h"
#include "VPlanCFG.h"
-#include "VPlanUtils.h"
#include "VPlanDominatorTree.h"
+#include "VPlanUtils.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/TypeSwitch.h"
>From 9e555741915c637c538b9c6a2e03d7ec8e589f00 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Fri, 30 Aug 2024 18:12:33 -0700
Subject: [PATCH 09/16] fixup! respond to some more reviews
---
llvm/include/llvm/Analysis/CSADescriptors.h | 2 +-
llvm/lib/Analysis/CSADescriptors.cpp | 41 ++++++++++---------
llvm/lib/Transforms/Vectorize/VPlan.h | 1 -
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 ++--
4 files changed, 25 insertions(+), 27 deletions(-)
diff --git a/llvm/include/llvm/Analysis/CSADescriptors.h b/llvm/include/llvm/Analysis/CSADescriptors.h
index edd98777d84ab6..75372c1ba93d8a 100644
--- a/llvm/include/llvm/Analysis/CSADescriptors.h
+++ b/llvm/include/llvm/Analysis/CSADescriptors.h
@@ -1,4 +1,4 @@
-//===- llvm/Analysis/CSADescriptors.h - CSA Descriptors --*- C++ -*-===//
+//===------------- CSADescriptors.h - CSA Descriptors -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Analysis/CSADescriptors.cpp b/llvm/lib/Analysis/CSADescriptors.cpp
index d0377c8c16de33..1f4ac5ef8b2516 100644
--- a/llvm/lib/Analysis/CSADescriptors.cpp
+++ b/llvm/lib/Analysis/CSADescriptors.cpp
@@ -1,4 +1,4 @@
-//=== llvm/Analysis/CSADescriptors.cpp - CSA Descriptors -*- C++ -*-===//
+//===----------- CSADescriptors.cpp - CSA Descriptors -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -19,43 +19,43 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "csa-descriptors"
+/// Return CSADescriptor that describes a CSA that matches one of these
+/// patterns:
+/// phi loop_inv, (select cmp, value, phi)
+/// phi loop_inv, (select cmp, phi, value)
+/// phi (select cmp, value, phi), loop_inv
+/// phi (select cmp, phi, value), loop_inv
+/// If the CSA does not match any of these patterns, return a CSADescriptor
+/// that describes an InvalidCSA.
CSADescriptor CSADescriptor::isCSAPhi(PHINode *Phi, Loop *TheLoop) {
- // Return CSADescriptor that describes a CSA that matches one of these
- // patterns:
- // phi loop_inv, (select cmp, value, phi)
- // phi loop_inv, (select cmp, phi, value)
- // phi (select cmp, value, phi), loop_inv
- // phi (select cmp, phi, value), loop_inv
- // If the CSA does not match any of these paterns, return a CSADescriptor
- // that describes an InvalidCSA.
// Must be a scalar
Type *Type = Phi->getType();
if (!Type->isIntegerTy() && !Type->isFloatingPointTy() &&
!Type->isPointerTy())
- return CSADescriptor();
+ return {};
// Match phi loop_inv, (select cmp, value, phi)
// or phi loop_inv, (select cmp, phi, value)
// or phi (select cmp, value, phi), loop_inv
// or phi (select cmp, phi, value), loop_inv
if (Phi->getNumIncomingValues() != 2)
- return CSADescriptor();
- auto SelectInstIt = find_if(Phi->incoming_values(), [&Phi](Use &U) {
+ return {};
+ auto SelectInstIt = find_if(Phi->incoming_values(), [&Phi](const Use &U) {
return match(U.get(), m_Select(m_Value(), m_Specific(Phi), m_Value())) ||
match(U.get(), m_Select(m_Value(), m_Value(), m_Specific(Phi)));
});
if (SelectInstIt == Phi->incoming_values().end())
- return CSADescriptor();
+ return {};
auto LoopInvIt = find_if(Phi->incoming_values(), [&](Use &U) {
return U.get() != *SelectInstIt && TheLoop->isLoopInvariant(U.get());
});
if (LoopInvIt == Phi->incoming_values().end())
- return CSADescriptor();
+ return {};
// Phi or Sel must be used only outside the loop,
// excluding if Phi use Sel or Sel use Phi
- auto IsOnlyUsedOutsideLoop = [=](Value *V, Value *Ignore) {
+ auto IsOnlyUsedOutsideLoop = [&](Value *V, Value *Ignore) {
return all_of(V->users(), [Ignore, TheLoop](User *U) {
if (U == Ignore)
return true;
@@ -64,10 +64,11 @@ CSADescriptor CSADescriptor::isCSAPhi(PHINode *Phi, Loop *TheLoop) {
return true;
});
};
- auto *Sel = cast<SelectInst>(SelectInstIt->get());
- auto *LoopInv = LoopInvIt->get();
- if (!IsOnlyUsedOutsideLoop(Phi, Sel) || !IsOnlyUsedOutsideLoop(Sel, Phi))
- return CSADescriptor();
+ Instruction *Select = cast<SelectInst>(SelectInstIt->get());
+ Value *LoopInv = LoopInvIt->get();
+ if (!IsOnlyUsedOutsideLoop(Phi, Select) ||
+ !IsOnlyUsedOutsideLoop(Select, Phi))
+ return {};
- return CSADescriptor(Phi, Sel, LoopInv);
+ return {Phi, Select, LoopInv};
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2eb887a0e7d198..e709a67183d513 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4104,7 +4104,6 @@ class VPlanSlp {
/// Return true if all visited instruction can be combined.
bool isCompletelySLP() const { return CompletelySLP; }
};
-
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9f50855f4e7807..9c8309c648ba7c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -689,8 +689,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
State.set(this, InitMask, Part);
return InitMask;
}
- Value *V = State.get(this, Part - 1);
- return V;
+ return State.get(this, Part - 1);
}
case VPInstruction::CSAInitData: {
if (Part == 0) {
@@ -699,8 +698,7 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
State.set(this, InitData, Part);
return InitData;
}
- Value *V = State.get(this, Part - 1);
- return V;
+ return State.get(this, Part - 1);
}
case VPInstruction::CSAMaskPhi: {
if (Part == 0) {
@@ -2407,7 +2405,7 @@ void VPCSAExtractScalarRecipe::execute(VPTransformState &State) {
// inactive, which would also cause the reduction to have value 0.
Value *MaybeLastIdx = State.Builder.CreateIntMaxReduce(ActiveLaneIdxs);
Value *IsLaneZeroActive =
- State.Builder.CreateExtractElement(MaskSel, (uint64_t)0);
+ State.Builder.CreateExtractElement(MaskSel, static_cast<uint64_t>(0));
Value *Zero = ConstantInt::get(MaybeLastIdx->getType(), 0);
Value *MaybeLastIdxEQZero = State.Builder.CreateICmpEQ(MaybeLastIdx, Zero);
Value *And = State.Builder.CreateAnd(IsLaneZeroActive, MaybeLastIdxEQZero);
>From 7f4852b7390d0725f36fcc4cd7a1b5e098b35a06 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 3 Sep 2024 10:50:33 -0700
Subject: [PATCH 10/16] fixup! refactor phi functions
---
llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 +--
llvm/lib/Transforms/Vectorize/VPlan.h | 31 ++++++++++++++++++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 +++++++++
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 27 ----------------
llvm/lib/Transforms/Vectorize/VPlanUtils.h | 11 -------
.../Transforms/Vectorize/VPlanVerifier.cpp | 6 ++--
6 files changed, 50 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 462d29b0ca8136..ea726d8a9dbf91 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -217,7 +217,7 @@ void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
- while (It != end() && vputils::isPhi(*It))
+ while (It != end() && It->isPhi())
It++;
return It;
}
@@ -1056,7 +1056,7 @@ void VPlan::execute(VPTransformState *State) {
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
// Skip phi-like recipes that generate their backedege values themselves.
- if (vputils::isPhiThatGeneratesBackedge(R))
+ if (R.isPhiThatGeneratesBackedge())
continue;
if (isa<VPWidenPointerInductionRecipe>(&R) ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e709a67183d513..44393ec42d45a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -890,10 +890,29 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
bool mayHaveSideEffects() const;
/// Returns true for PHI-like recipes.
- bool isPhi() const {
+ virtual bool isPhi() const {
+ assert(getVPDefID() != VPInstructionSC &&
+ "VPInstructions implement this function themselves");
return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
}
+ /// Returns true for PHI-like recipes that exist in the vector loop header
+ /// basic block.
+ virtual bool isHeaderPhi() const {
+ assert(getVPDefID() != VPInstructionSC &&
+ "VPInstructions implement this function themselves");
+ return (getVPDefID() >= VPFirstHeaderPHISC &&
+ getVPDefID() <= VPLastHeaderPHISC) ||
+ getVPDefID() == VPWidenPHISC;
+ }
+
+ /// Returns true for PHI-like recipes that generate their own backedge
+ virtual bool isPhiThatGeneratesBackedge() const {
+ assert(getVPDefID() != VPInstructionSC &&
+ "VPInstructions implement this function themselves");
+ return getVPDefID() == VPWidenPHISC || getVPDefID() == VPCSAHeaderPHISC;
+ }
+
/// Returns true if the recipe may read from memory.
bool mayReadFromMemory() const;
@@ -1462,6 +1481,16 @@ class VPInstruction : public VPRecipeWithIRFlags {
/// Returns true if this VPInstruction's operands are single scalars and the
/// result is also a single scalar.
bool isSingleScalar() const;
+
+ /// Returns true for PHI-like recipes.
+ bool isPhi() const override;
+
+ /// Returns true for PHI-like recipes that exist in the vector loop header
+ /// basic block.
+ bool isHeaderPhi() const override;
+
+ /// Returns true for PHI-like recipes that generate their own backedge
+ bool isPhiThatGeneratesBackedge() const override;
};
/// A recipe to wrap on original IR instruction not to be modified during
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9c8309c648ba7c..64ae1c276f01af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -788,6 +788,21 @@ bool VPInstruction::isSingleScalar() const {
getOpcode() == VPInstruction::ExplicitVectorLength;
}
+bool VPInstruction::isPhi() const {
+ return getOpcode() == VPInstruction::CSAMaskPhi ||
+ getOpcode() == VPInstruction::CSAVLPhi;
+}
+
+bool VPInstruction::isHeaderPhi() const {
+ return getOpcode() == VPInstruction::CSAMaskPhi ||
+ getOpcode() == VPInstruction::CSAVLPhi;
+}
+
+bool VPInstruction::isPhiThatGeneratesBackedge() const {
+ return getOpcode() == VPInstruction::CSAMaskPhi ||
+ getOpcode() == VPInstruction::CSAVLPhi;
+}
+
#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
// Inspired by FPMathOperator::classof. Notable differences are that we don't
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 34785d6aba39fe..c18bea4f4c5926 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -60,30 +60,3 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
return match(V, m_Binary<Instruction::ICmp>(m_VPValue(A), m_VPValue(B))) &&
IsWideCanonicalIV(A) && B == Plan.getOrCreateBackedgeTakenCount();
}
-
-bool vputils::isPhi(const VPRecipeBase &R) {
- if (R.isPhi())
- return true;
- if (auto *VPInst = dyn_cast<VPInstruction>(&R))
- return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
- VPInst->getOpcode() == VPInstruction::CSAVLPhi;
- return false;
-}
-
-bool vputils::isPhiThatGeneratesBackedge(const VPRecipeBase &R) {
- if (isa<VPWidenPHIRecipe, VPCSAHeaderPHIRecipe>(&R))
- return true;
- if (auto *VPInst = dyn_cast<VPInstruction>(&R))
- return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
- VPInst->getOpcode() == VPInstruction::CSAVLPhi;
- return false;
-}
-
-bool vputils::isHeaderPhi(const VPRecipeBase &R) {
- if (isa<VPHeaderPHIRecipe, VPWidenPHIRecipe>(&R))
- return true;
- if (auto *VPInst = dyn_cast<VPInstruction>(&R))
- return VPInst->getOpcode() == VPInstruction::CSAMaskPhi ||
- VPInst->getOpcode() == VPInstruction::CSAVLPhi;
- return false;
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index ddbb32a1ec0c83..fc11208a433961 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -45,17 +45,6 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {
/// Return true if \p V is a header mask in \p Plan.
bool isHeaderMask(const VPValue *V, VPlan &Plan);
-
-/// Returns true for PHI-like recipes.
-bool isPhi(const VPRecipeBase &R);
-
-/// Returns true for PHI-like recipes that generate their own backedge
-bool isPhiThatGeneratesBackedge(const VPRecipeBase &R);
-
-/// Returns true for PHI-like recipes that exists in vector loop header basic
-/// block
-bool isHeaderPhi(const VPRecipeBase &R);
-
} // end namespace llvm::vputils
#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 2b9c32624a2916..808879b3f9c3b9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -74,11 +74,11 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
const VPRegionBlock *ParentR = VPBB->getParent();
bool IsHeaderVPBB = ParentR && !ParentR->isReplicator() &&
ParentR->getEntryBasicBlock() == VPBB;
- while (RecipeI != End && vputils::isPhi(*RecipeI)) {
+ while (RecipeI != End && RecipeI->isPhi()) {
if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI))
NumActiveLaneMaskPhiRecipes++;
- if (IsHeaderVPBB && !vputils::isHeaderPhi(*RecipeI)) {
+ if (IsHeaderVPBB && !RecipeI->isHeaderPhi()) {
errs() << "Found non-header PHI recipe in header VPBB";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
errs() << ": ";
@@ -105,7 +105,7 @@ bool VPlanVerifier::verifyPhiRecipes(const VPBasicBlock *VPBB) {
}
while (RecipeI != End) {
- if (vputils::isPhi(*RecipeI) && !isa<VPBlendRecipe>(&*RecipeI)) {
+ if (RecipeI->isPhi() && !isa<VPBlendRecipe>(&*RecipeI)) {
errs() << "Found phi-like recipe after non-phi recipe";
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
>From 55e9f07cb8ed29b6c4123bc87c0036e1d573dea2 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 3 Sep 2024 11:24:50 -0700
Subject: [PATCH 11/16] fixup! update test checks
---
.../LoopVectorize/conditional-scalar-assignment.ll | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
index 0269f0c672a3fa..48af3719c06ed1 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
@@ -2981,7 +2981,7 @@ for.body: ; preds = %for.body.preheader,
; }
; return t; // use t
; }
-define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
+define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
; EVL-LABEL: @simple_csa_ptr_select(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
@@ -3002,7 +3002,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
; EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -3028,7 +3028,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
; NO-EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -3054,7 +3054,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i32 %a) {
; DATA-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP2]]
+; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
>From 4d9787acbb51482707e6537aca586f7fd550f4a8 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 3 Sep 2024 11:59:34 -0700
Subject: [PATCH 12/16] fixup! use vpbuilder
---
.../Vectorize/LoopVectorizationPlanner.h | 31 +++++++++++++++++--
.../Transforms/Vectorize/LoopVectorize.cpp | 26 +++++++---------
2 files changed, 40 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 2fe9af6b0d14f5..9371e3c2f57ea2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -164,8 +164,8 @@ class VPBuilder {
return tryInsertInstruction(
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
}
- VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
- const Twine &Name = "") {
+ VPInstruction *createNot(VPValue *Operand, DebugLoc DL = {},
+ const Twine &Name = "") {
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
}
@@ -210,6 +210,33 @@ class VPBuilder {
new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
}
+ VPInstruction *createCSAInitMask(DebugLoc DL, const Twine &Name) {
+ return createInstruction(VPInstruction::CSAInitMask, {}, DL, Name);
+ }
+
+ VPInstruction *createCSAInitData(VPValue *InitScalar, DebugLoc DL,
+ const Twine &Name) {
+ return createInstruction(VPInstruction::CSAInitData, {InitScalar}, DL,
+ Name);
+ }
+
+ VPInstruction *createCSAMaskPhi(VPValue *InitMask, DebugLoc DL,
+ const Twine &Name) {
+ return createInstruction(VPInstruction::CSAMaskPhi, {InitMask}, DL, Name);
+ }
+
+ VPInstruction *createCSAAnyActive(VPValue *Cond, DebugLoc DL,
+ const Twine &Name) {
+ return createInstruction(VPInstruction::CSAAnyActive, {Cond}, DL, Name);
+ }
+
+ VPInstruction *createCSAMaskSel(VPValue *Cond, VPValue *MaskPhi,
+ VPValue *AnyActive, DebugLoc DL,
+ const Twine &Name) {
+ return createInstruction(VPInstruction::CSAMaskSel,
+ {Cond, MaskPhi, AnyActive}, DL, Name);
+ }
+
VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
FPMathOperator *FPBinOp, VPValue *Start,
VPCanonicalIVPHIRecipe *CanonicalIV,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b3784ca17bd6c7..d681567048eae4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8661,16 +8661,12 @@ addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
continue;
}
- auto *VPInitMask =
- new VPInstruction(VPInstruction::CSAInitMask, {}, DL, "csa.init.mask");
- auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
- {VPInitScalar}, DL, "csa.init.data");
- PreheaderVPBB->appendRecipe(VPInitMask);
- PreheaderVPBB->appendRecipe(VPInitData);
+ VPBuilder PHB(PreheaderVPBB);
+ auto *VPInitMask = PHB.createCSAInitMask(DL, "csa.init.mask");
+ auto *VPInitData = PHB.createCSAInitData(VPInitScalar, DL, "csa.init.data");
- auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
- DL, "csa.mask.phi");
- HeaderVPBB->appendRecipe(VPMaskPhi);
+ VPBuilder HB(HeaderVPBB);
+ auto *VPMaskPhi = HB.createCSAMaskPhi(VPInitMask, DL, "csa.mask.phi");
auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi);
Plan.addCSAState(CSA.first, S);
@@ -8709,22 +8705,22 @@ addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
// In that case, we must use the negation of WidenedCond.
// i.e. select cond new_val old_val versus select cond.not old_val new_val
VPValue *CondToUse = WidenedCond;
+ VPBuilder B;
if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
CSA.first) {
- auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
+ auto *VPNotCond = B.createNot(WidenedCond, DL);
VPNotCond->insertBefore(
GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
CondToUse = VPNotCond;
}
- auto *VPAnyActive = new VPInstruction(
- VPInstruction::CSAAnyActive, {CondToUse}, DL, "csa.cond.anyactive");
+ auto *VPAnyActive =
+ B.createCSAAnyActive(CondToUse, DL, "csa.cond.anyactive");
VPAnyActive->insertBefore(
GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
- auto *VPMaskSel = new VPInstruction(
- VPInstruction::CSAMaskSel,
- {CondToUse, CSAState->getVPMaskPhi(), VPAnyActive}, DL, "csa.mask.sel");
+ auto *VPMaskSel = B.createCSAMaskSel(CondToUse, CSAState->getVPMaskPhi(),
+ VPAnyActive, DL, "csa.mask.sel");
VPMaskSel->insertAfter(VPAnyActive);
VPDataUpdate->setVPNewMaskAndVPAnyActive(VPMaskSel, VPAnyActive);
VPCSAExtractScalarRecipe *ExtractScalarRecipe =
>From b53f386c0fae24c5e25a88f88f36951232d9eab2 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 9 Sep 2024 06:46:00 -0700
Subject: [PATCH 13/16] fixup! simplify tests
---
.../RISCV/conditional-scalar-assignment.ll | 360 +---
.../conditional-scalar-assignment.ll | 1510 +++++------------
2 files changed, 474 insertions(+), 1396 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
index 6d7816800603e2..d8dd1d34e2bec3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -5,9 +5,6 @@
; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=none \
; RUN: -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
; RUN: | FileCheck %s -check-prefix=NO-EVL
-; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data \
-; RUN: -enable-csa-vectorization -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=256 \
-; RUN: | FileCheck %s -check-prefix=DATA
; This function is generated from the following C/C++ program:
; uint64_t idx_scalar(int64_t *a, int64_t *b, uint64_t ii, uint64_t n) {
@@ -20,7 +17,7 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-LABEL: @idx_scalar(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL-NEXT: br i1 [[CMP8_NOT]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
@@ -75,15 +72,15 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
; EVL: for.body:
; EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -96,12 +93,12 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
; EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
; NO-EVL-LABEL: @idx_scalar(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP8_NOT]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; NO-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
@@ -156,15 +153,15 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
; NO-EVL: for.body:
; NO-EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -177,102 +174,17 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
; NO-EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; DATA-LABEL: @idx_scalar(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP8_NOT:%.*]] = icmp eq i64 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP8_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; DATA-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; DATA-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; DATA-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; DATA-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; DATA-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; DATA-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; DATA-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; DATA-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; DATA-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; DATA-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
-; DATA-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
-; DATA-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
%cmp8.not = icmp eq i64 %n, 0
- br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+ br i1 %cmp8.not, label %exit, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
- %cond.lcssa = phi i64 [ %cond, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+exit: ; preds = %for.body, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
ret i64 %idx.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
@@ -286,7 +198,7 @@ for.body: ; preds = %for.body.preheader,
%cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
%inc = add nuw i64 %i.010, 1
%exitcond.not = icmp eq i64 %inc, %n
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -300,7 +212,7 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-LABEL: @idx_scalar_dec(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; EVL-NEXT: br i1 [[CMP_NOT9]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
; EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -348,15 +260,15 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
; EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
; EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
; EVL: for.body:
; EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -369,12 +281,12 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
; EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
; EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; EVL-NEXT: br i1 [[CMP_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
; NO-EVL-LABEL: @idx_scalar_dec(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP_NOT9]], label [[EXIT:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
; NO-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -422,15 +334,15 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
; NO-EVL-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i64 [[IDX_0_LCSSA]]
; NO-EVL: for.body:
; NO-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -443,95 +355,17 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
; NO-EVL-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
; NO-EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; DATA-LABEL: @idx_scalar_dec(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP_NOT9:%.*]] = icmp eq i64 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP_NOT9]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; DATA-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; DATA-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[DOTSPLAT]], <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <8 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <8 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP0:%.*]] = add <8 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-; DATA-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0
-; DATA-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]]
-; DATA-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
-; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -7
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
-; DATA-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP1]]
-; DATA-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; DATA-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -7
-; DATA-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
-; DATA-NEXT: [[REVERSE2:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD1]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; DATA-NEXT: [[TMP8:%.*]] = icmp sgt <8 x i64> [[REVERSE]], [[REVERSE2]]
-; DATA-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP9]], <8 x i1> [[TMP8]], <8 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP9]], <8 x i64> [[VEC_IND]], <8 x i64> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8, i64 -8>
-; DATA-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[TMP11:%.*]] = select <8 x i1> [[CSA_MASK_SEL]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> zeroinitializer
-; DATA-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> [[TMP11]])
-; DATA-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP12]], 0
-; DATA-NEXT: [[TMP15:%.*]] = and i1 [[TMP13]], [[TMP14]]
-; DATA-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <8 x i64> [[CSA_DATA_SEL]], i32 [[TMP16]]
-; DATA-NEXT: [[TMP17:%.*]] = icmp sge i32 [[TMP16]], 0
-; DATA-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i64 [[IDX_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[I_011:%.*]] = phi i64 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; DATA-NEXT: [[IDX_010:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
-; DATA-NEXT: [[SUB]] = add i64 [[I_011]], -1
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[SUB]]
-; DATA-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[SUB]]
-; DATA-NEXT: [[TMP20:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; DATA-NEXT: [[CMP3:%.*]] = icmp sgt i64 [[TMP19]], [[TMP20]]
-; DATA-NEXT: [[COND]] = select i1 [[CMP3]], i64 [[I_011]], i64 [[IDX_010]]
-; DATA-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[SUB]], 0
-; DATA-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-EVL-NEXT: br i1 [[CMP_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
entry:
%cmp.not9 = icmp eq i64 %n, 0
- br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+ br i1 %cmp.not9, label %exit, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
- %cond.lcssa = phi i64 [ %cond, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+exit: ; preds = %for.body, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
ret i64 %idx.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
@@ -545,10 +379,9 @@ for.body: ; preds = %for.body.preheader,
%cmp3 = icmp sgt i64 %0, %1
%cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
%cmp.not = icmp eq i64 %sub, 0
- br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+ br i1 %cmp.not, label %exit, label %for.body
}
-
; This function is generated from the following C/C++ program:
; int *simple_csa_ptr_select(int N, int **data) {
; int *t = nullptr;
@@ -562,7 +395,7 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; EVL-LABEL: @simple_csa_ptr_select(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -617,15 +450,15 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
; EVL-NEXT: ret ptr [[T_0_LCSSA]]
; EVL: for.body:
; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -638,12 +471,12 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_ptr_select(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -698,15 +531,15 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
; NO-EVL-NEXT: ret ptr [[T_0_LCSSA]]
; NO-EVL: for.body:
; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -719,103 +552,18 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_ptr_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; DATA-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; DATA-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
-; DATA-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; DATA-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; DATA-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
-; DATA-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; DATA-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; DATA-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; DATA-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; DATA-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; DATA-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; DATA-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; DATA-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; DATA-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; DATA-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret ptr [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; DATA-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp9, label %for.body.preheader, label %exit
for.body.preheader:
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit:
- %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup:
- %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit:
+ %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select, %for.body ]
ret ptr %t.0.lcssa
for.body:
@@ -829,5 +577,5 @@ for.body:
%spec.select = select i1 %cmp1, ptr %0, ptr %t.010
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ br i1 %exitcond.not, label %exit, label %for.body
}
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
index 48af3719c06ed1..dc88d3d5d5528c 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
@@ -7,10 +7,6 @@
; RUN: -enable-csa-vectorization -scalable-vectorization=on \
; RUN: -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
; RUN: | FileCheck %s -check-prefix=NO-EVL
-; RUN: opt < %s -S -passes=loop-vectorize -force-tail-folding-style=data \
-; RUN: -enable-csa-vectorization -scalable-vectorization=on \
-; RUN: -force-target-supports-scalable-vectors -force-target-instruction-cost=1 \
-; RUN: | FileCheck %s -check-prefix=DATA
; This function is generated from the following C/C++ program:
; int simple_csa_int_select(int N, int *data, int a) {
@@ -25,7 +21,7 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
; EVL-LABEL: @simple_csa_int_select(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -67,32 +63,32 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
; EVL-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
; EVL-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
; EVL-NEXT: ret i32 [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; EVL-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_int_select(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -134,122 +130,51 @@ define i32 @simple_csa_int_select(i32 %N, ptr %data, i64 %a) {
; NO-EVL-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
; NO-EVL-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; NO-EVL-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
-; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP3]]
-; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
-; DATA-NEXT: [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; DATA-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP10:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP10]])
-; DATA-NEXT: [[TMP12:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 0
-; DATA-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
-; DATA-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP15]]
-; DATA-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP15]], 0
-; DATA-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP19]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP18]], i32 [[T_010]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp9, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
- %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit: ; preds = %for.body, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
ret i32 %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%1 = sext i32 %0 to i64
%cmp1 = icmp slt i64 %a, %1
%spec.select = select i1 %cmp1, i32 %0, i32 %t.010
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -265,7 +190,7 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; EVL-LABEL: @simple_csa_int_select_induction_cmp(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -315,32 +240,32 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
; EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
; EVL-NEXT: ret i32 [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; EVL-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP24]]
; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_int_select_induction_cmp(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -390,130 +315,51 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
; NO-EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; NO-EVL-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP24]]
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select_induction_cmp(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; DATA-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; DATA-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; DATA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; DATA-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; DATA-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; DATA-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; DATA-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; DATA-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
-; DATA-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; DATA-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP24]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp9, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
- %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit: ; preds = %for.body, %entry
+ %t.0.lcssa = phi i32 [ -1, %entry ], [ %spec.select, %for.body ]
ret i32 %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%t.010 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%1 = sext i32 %0 to i64
- %cmp1 = icmp slt i64 %indvars.iv, %1
+ %cmp1 = icmp slt i64 %iv, %1
%spec.select = select i1 %cmp1, i32 %0, i32 %t.010
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -529,7 +375,7 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
; EVL-LABEL: @simple_csa_float_select(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -568,31 +414,31 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
; EVL-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
; EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
; EVL-NEXT: ret float [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
; EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_float_select(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -631,112 +477,49 @@ define float @simple_csa_float_select(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
; NO-EVL-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
; NO-EVL-NEXT: ret float [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_float_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA:%.*]], i64 [[TMP3]]
-; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
-; DATA-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
-; DATA-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP9:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP9]])
-; DATA-NEXT: [[TMP11:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], 0
-; DATA-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; DATA-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP14]]
-; DATA-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP14]], 0
-; DATA-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[CSA_EXTRACT]], float 1.000000e+00
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[T_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret float [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_09:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 0.000000e+00
-; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP17]], float [[T_09]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
entry:
%cmp8 = icmp sgt i32 %N, 0
- br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp8, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.body, %entry
+exit: ; preds = %for.body, %entry
%t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.body ]
ret float %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%t.09 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds float, ptr %data, i64 %iv
%0 = load float, ptr %arrayidx, align 4
%cmp1 = fcmp ogt float %0, 0.000000e+00
%t.1 = select i1 %cmp1, float %0, float %t.09
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -755,39 +538,36 @@ define i32 @simple_csa_int(i32 %N, ptr %cond, ptr %data) {
; NO-EVL-LABEL: @simple_csa_int(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @simple_csa_int(
-; DATA-NOT: vector.body:
-;
entry:
%cmp6 = icmp sgt i32 %N, 0
- br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp6, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.inc, %entry
+exit: ; preds = %for.inc, %entry
%t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
ret i32 %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%t.07 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
%0 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %0, 0
br i1 %tobool.not, label %for.inc, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds i32, ptr %data, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%t.1 = phi i32 [ %1, %if.then ], [ %t.07, %for.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -806,39 +586,36 @@ define float @simple_csa_float(i32 %N, ptr %cond, ptr %data) {
; NO-EVL-LABEL: @simple_csa_float(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @simple_csa_float(
-; NO-EVL-NOT: vector.body:
-;
entry:
%cmp6 = icmp sgt i32 %N, 0
- br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp6, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.inc, %entry
+exit: ; preds = %for.inc, %entry
%t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
ret float %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%t.07 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
%0 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %0, 0
br i1 %tobool.not, label %for.inc, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds float, ptr %data, i64 %iv
%1 = load float, ptr %arrayidx2, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%t.1 = phi float [ %1, %if.then ], [ %t.07, %for.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -857,7 +634,7 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
; EVL-LABEL: @csa_in_series_int_select(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -919,40 +696,40 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
; EVL-NEXT: [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
; EVL-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP31]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; EVL-NEXT: ret i32 [[OR]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
; EVL-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; EVL-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
; EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
; EVL-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
; NO-EVL-LABEL: @csa_in_series_int_select(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1014,164 +791,69 @@ define i32 @csa_in_series_int_select(i32 %N, ptr %data0, ptr %data1, i64 %a) {
; NO-EVL-NEXT: [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
; NO-EVL-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP31]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; NO-EVL-NEXT: ret i32 [[OR]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; NO-EVL-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
; NO-EVL-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_int_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[A:%.*]], i64 0
-; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP3]]
-; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP5]], align 4
-; DATA-NEXT: [[TMP6:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT: [[TMP7:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP6]]
-; DATA-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP7]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP8]], <vscale x 1 x i1> [[TMP7]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP8]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP3]]
-; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; DATA-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; DATA-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
-; DATA-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; DATA-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; DATA-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; DATA-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP20]]
-; DATA-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; DATA-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT7]], i32 -1
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP23:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP23]])
-; DATA-NEXT: [[TMP25:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP24]], 0
-; DATA-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
-; DATA-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP28]]
-; DATA-NEXT: [[TMP29:%.*]] = icmp sge i32 [[TMP28]], 0
-; DATA-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[TMP31:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP31]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret i32 [[OR]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A]], [[TMP33]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP32]], i32 [[T_022]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[TMP35:%.*]] = sext i32 [[TMP34]] to i64
-; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[A]], [[TMP35]]
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP34]], i32 [[S_023]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
entry:
%cmp21 = icmp sgt i32 %N, 0
- br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp21, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
+exit.loopexit: ; preds = %for.body
%0 = or i32 %s.1, %spec.select
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
ret i32 %or
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
%t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
%1 = load i32, ptr %arrayidx, align 4
%2 = sext i32 %1 to i64
%cmp1 = icmp slt i64 %a, %2
%spec.select = select i1 %cmp1, i32 %1, i32 %t.022
- %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
%3 = load i32, ptr %arrayidx5, align 4
%4 = sext i32 %3 to i64
%cmp6 = icmp slt i64 %a, %4
%s.1 = select i1 %cmp6, i32 %3, i32 %s.023
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -1190,7 +872,7 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; EVL-LABEL: @csa_in_series_int_select_induction_cmp(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1260,40 +942,40 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; EVL-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
; EVL-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; EVL-NEXT: ret i32 [[OR]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
; EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP38]]
; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
; EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
; EVL-NEXT: [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
;
; NO-EVL-LABEL: @csa_in_series_int_select_induction_cmp(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1363,172 +1045,69 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; NO-EVL-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
; NO-EVL-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; NO-EVL-NEXT: ret i32 [[OR]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; NO-EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP38]]
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
; NO-EVL-NEXT: [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_int_select_induction_cmp(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP21]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; DATA-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; DATA-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; DATA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
-; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; DATA-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; DATA-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; DATA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
-; DATA-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
-; DATA-NEXT: [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; DATA-NEXT: [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
-; DATA-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
-; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
-; DATA-NEXT: [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; DATA-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; DATA-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
-; DATA-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; DATA-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
-; DATA-NEXT: [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
-; DATA-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-; DATA-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
-; DATA-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
-; DATA-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret i32 [[OR]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP38]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; DATA-NEXT: [[CMP6:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP40]]
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
;
entry:
%cmp21 = icmp sgt i32 %N, 0
- br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp21, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
+exit.loopexit: ; preds = %for.body
%0 = or i32 %s.1, %spec.select
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
ret i32 %or
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%s.023 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.body ]
%t.022 = phi i32 [ -1, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
%1 = load i32, ptr %arrayidx, align 4
%2 = sext i32 %1 to i64
- %cmp1 = icmp slt i64 %indvars.iv, %2
+ %cmp1 = icmp slt i64 %iv, %2
%spec.select = select i1 %cmp1, i32 %1, i32 %t.022
- %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
%3 = load i32, ptr %arrayidx5, align 4
%4 = sext i32 %3 to i64
- %cmp6 = icmp slt i64 %indvars.iv, %4
+ %cmp6 = icmp slt i64 %iv, %4
%s.1 = select i1 %cmp6, i32 %3, i32 %s.023
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -1548,7 +1127,7 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
; EVL-LABEL: @csa_in_series_float_select(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1606,38 +1185,38 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
; EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
; EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP29]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
; EVL-NEXT: ret float [[ADD]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
; EVL-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
; EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
-; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
; EVL-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
; EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
;
; NO-EVL-LABEL: @csa_in_series_float_select(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -1695,154 +1274,65 @@ define float @csa_in_series_float_select(i32 %N, ptr %data0, ptr %data1) {
; NO-EVL-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
; NO-EVL-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[ADD:%.*]] = phi float [ [[TMP29]], [[EXIT_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
; NO-EVL-NEXT: ret float [[ADD]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; NO-EVL-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
; NO-EVL-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
-; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
; NO-EVL-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
-; DATA-LABEL: @csa_in_series_float_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL4:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI1:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x float> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[DATA0:%.*]], i64 [[TMP3]]
-; DATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x float>, ptr [[TMP5]], align 4
-; DATA-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD]], zeroinitializer
-; DATA-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP6]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP7]], <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP7]], <vscale x 1 x float> [[WIDE_LOAD]], <vscale x 1 x float> [[CSA_DATA_PHI2]]
-; DATA-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DATA1:%.*]], i64 [[TMP3]]
-; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
-; DATA-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x float>, ptr [[TMP9]], align 4
-; DATA-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 1 x float> [[WIDE_LOAD3]], zeroinitializer
-; DATA-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP10]])
-; DATA-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP11]], <vscale x 1 x i1> [[TMP10]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP11]], <vscale x 1 x float> [[WIDE_LOAD3]], <vscale x 1 x float> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP13:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP13]])
-; DATA-NEXT: [[TMP15:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; DATA-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP14]], 0
-; DATA-NEXT: [[TMP17:%.*]] = and i1 [[TMP15]], [[TMP16]]
-; DATA-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL5]], i32 [[TMP18]]
-; DATA-NEXT: [[TMP19:%.*]] = icmp sge i32 [[TMP18]], 0
-; DATA-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[CSA_EXTRACT7]], float 1.000000e+00
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP21:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP21]])
-; DATA-NEXT: [[TMP23:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP22]], 0
-; DATA-NEXT: [[TMP25:%.*]] = and i1 [[TMP23]], [[TMP24]]
-; DATA-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x float> [[CSA_DATA_SEL]], i32 [[TMP26]]
-; DATA-NEXT: [[TMP27:%.*]] = icmp sge i32 [[TMP26]], 0
-; DATA-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], float [[CSA_EXTRACT]], float 1.000000e+00
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[T_1_LCSSA:%.*]] = phi float [ [[T_1:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[S_1_LCSSA:%.*]] = phi float [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: [[TMP29:%.*]] = fadd float [[T_1_LCSSA]], [[S_1_LCSSA]]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[ADD:%.*]] = phi float [ [[TMP29]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ 2.000000e+00, [[ENTRY:%.*]] ]
-; DATA-NEXT: ret float [[ADD]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[S_021:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_020:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[T_1]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[DATA0]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP30:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP30]], 0.000000e+00
-; DATA-NEXT: [[T_1]] = select i1 [[CMP1]], float [[TMP30]], float [[T_020]]
-; DATA-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DATA1]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; DATA-NEXT: [[CMP6:%.*]] = fcmp ogt float [[TMP31]], 0.000000e+00
-; DATA-NEXT: [[S_1]] = select i1 [[CMP6]], float [[TMP31]], float [[S_021]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
;
entry:
%cmp19 = icmp sgt i32 %N, 0
- br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp19, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
+exit.loopexit: ; preds = %for.body
%0 = fadd float %t.1, %s.1
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
ret float %add
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%s.021 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.body ]
%t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
%1 = load float, ptr %arrayidx, align 4
%cmp1 = fcmp ogt float %1, 0.000000e+00
%t.1 = select i1 %cmp1, float %1, float %t.020
- %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %iv
%2 = load float, ptr %arrayidx5, align 4
%cmp6 = fcmp ogt float %2, 0.000000e+00
%s.1 = select i1 %cmp6, float %2, float %s.021
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -1864,56 +1354,53 @@ define i32 @csa_in_series_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %d
; NO-EVL-LABEL: @csa_in_series_int(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_in_series_int(
-; DATA-NOT: vector.body:
-;
entry:
%cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp15, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.inc
+exit.loopexit: ; preds = %for.inc
%0 = or i32 %s.1, %t.1
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
ret i32 %or
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
%t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
%1 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %1, 0
br i1 %tobool.not, label %if.end, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
%2 = load i32, ptr %arrayidx2, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
%t.1 = phi i32 [ %2, %if.then ], [ %t.016, %for.body ]
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
%3 = load i8, ptr %arrayidx4, align 1
%tobool5.not = icmp eq i8 %3, 0
br i1 %tobool5.not, label %for.inc, label %if.then6
if.then6: ; preds = %if.end
- %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
%4 = load i32, ptr %arrayidx8, align 4
br label %for.inc
for.inc: ; preds = %if.end, %if.then6
%s.1 = phi i32 [ %4, %if.then6 ], [ %s.017, %if.end ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -1936,56 +1423,53 @@ define float @csa_in_series_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, pt
; NO-EVL-LABEL: @csa_in_series_float(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_in_series_float(
-; DATA-NOT: vector.body:
-;
entry:
%cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp15, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.inc
+exit.loopexit: ; preds = %for.inc
%0 = fadd float %t.1, %s.1
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
ret float %add
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
%t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
%1 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %1, 0
br i1 %tobool.not, label %if.end, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
%2 = load float, ptr %arrayidx2, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
%t.1 = phi float [ %2, %if.then ], [ %t.016, %for.body ]
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
%3 = load i8, ptr %arrayidx4, align 1
%tobool5.not = icmp eq i8 %3, 0
br i1 %tobool5.not, label %for.inc, label %if.then6
if.then6: ; preds = %if.end
- %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
%4 = load float, ptr %arrayidx8, align 4
br label %for.inc
for.inc: ; preds = %if.end, %if.then6
%s.1 = phi float [ %4, %if.then6 ], [ %s.017, %if.end ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2007,37 +1491,34 @@ define i32 @csa_in_series_same_scalar_int_select(i32 %N, ptr %data0, ptr %data1)
; NO-EVL-LABEL: @csa_in_series_same_scalar_int_select(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_in_series_same_scalar_int_select(
-; DATA-NOT: vector.body:
-;
entry:
%cmp21 = icmp sgt i32 %N, 0
- br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp21, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.body, %entry
+exit: ; preds = %for.body, %entry
%t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.body ]
ret i32 %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%t.022 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i32, ptr %data0, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%1 = sext i32 %0 to i64
- %cmp1 = icmp slt i64 %indvars.iv, %1
+ %cmp1 = icmp slt i64 %iv, %1
%spec.select = select i1 %cmp1, i32 %0, i32 %t.022
- %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %arrayidx5 = getelementptr inbounds i32, ptr %data1, i64 %iv
%2 = load i32, ptr %arrayidx5, align 4
%3 = sext i32 %2 to i64
- %cmp6 = icmp slt i64 %indvars.iv, %3
+ %cmp6 = icmp slt i64 %iv, %3
%t.2 = select i1 %cmp6, i32 %2, i32 %spec.select
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2059,35 +1540,32 @@ define float @csa_in_series_same_scalar_float_select(i32 %N, ptr %data0, ptr %da
; NO-EVL-LABEL: @csa_in_series_same_scalar_float_select(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_in_series_same_scalar_float_select(
-; NO-EVL-NOT: vector.body:
-;
entry:
%cmp19 = icmp sgt i32 %N, 0
- br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp19, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.body, %entry
+exit: ; preds = %for.body, %entry
%t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.body ]
ret float %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%t.020 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.body ]
- %arrayidx = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds float, ptr %data0, i64 %iv
%0 = load float, ptr %arrayidx, align 4
%cmp1 = fcmp ogt float %0, 0.000000e+00
%t.1 = select i1 %cmp1, float %0, float %t.020
- %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %arrayidx5 = getelementptr inbounds float, ptr %data1, i64 %iv
%1 = load float, ptr %arrayidx5, align 4
%cmp6 = fcmp ogt float %1, 0.000000e+00
%t.2 = select i1 %cmp6, float %1, float %t.1
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2109,51 +1587,48 @@ define i32 @csa_in_series_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %d
; NO-EVL-LABEL: @csa_in_series_same_scalar_int(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_in_series_same_scalar_int(
-; DATA-NOT: vector.body:
-;
entry:
%cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp15, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.inc, %entry
+exit: ; preds = %for.inc, %entry
%t.0.lcssa = phi i32 [ -1, %entry ], [ %t.2, %for.inc ]
ret i32 %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.2, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
%0 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %0, 0
br i1 %tobool.not, label %if.end, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
%1 = load i32, ptr %arrayidx2, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
%t.1 = phi i32 [ %1, %if.then ], [ %t.016, %for.body ]
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
%2 = load i8, ptr %arrayidx4, align 1
%tobool5.not = icmp eq i8 %2, 0
br i1 %tobool5.not, label %for.inc, label %if.then6
if.then6: ; preds = %if.end
- %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
%3 = load i32, ptr %arrayidx8, align 4
br label %for.inc
for.inc: ; preds = %if.end, %if.then6
%t.2 = phi i32 [ %3, %if.then6 ], [ %t.1, %if.end ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2175,51 +1650,48 @@ define float @csa_in_series_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, pt
; NO-EVL-LABEL: @csa_in_series_same_scalar_float(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_in_series_same_scalar_float(
-; DATA-NOT: vector.body:
-;
entry:
%cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp15, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.inc, %entry
+exit: ; preds = %for.inc, %entry
%t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.2, %for.inc ]
ret float %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.2, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
%0 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %0, 0
br i1 %tobool.not, label %if.end, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
%1 = load float, ptr %arrayidx2, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
%t.1 = phi float [ %1, %if.then ], [ %t.016, %for.body ]
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
%2 = load i8, ptr %arrayidx4, align 1
%tobool5.not = icmp eq i8 %2, 0
br i1 %tobool5.not, label %for.inc, label %if.then6
if.then6: ; preds = %if.end
- %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
%3 = load float, ptr %arrayidx8, align 4
br label %for.inc
for.inc: ; preds = %if.end, %if.then6
%t.2 = phi float [ %3, %if.then6 ], [ %t.1, %if.end ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2241,47 +1713,44 @@ define i32 @csa_same_cond_int(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
; NO-EVL-LABEL: @csa_same_cond_int(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_same_cond_int(
-; DATA-NOT: vector.body:
-;
entry:
%cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp9, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.inc
+exit.loopexit: ; preds = %for.inc
%0 = or i32 %s.1, %t.1
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
ret i32 %or
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%s.011 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
%t.010 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
%1 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %1, 0
br i1 %tobool.not, label %for.inc, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
%2 = load i32, ptr %arrayidx2, align 4
- %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i32, ptr %data1, i64 %iv
%3 = load i32, ptr %arrayidx4, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%t.1 = phi i32 [ %2, %if.then ], [ %t.010, %for.body ]
%s.1 = phi i32 [ %3, %if.then ], [ %s.011, %for.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2303,47 +1772,44 @@ define float @csa_same_cond_float(i32 %N, ptr %cond, ptr %data0, ptr %data1) {
; NO-EVL-LABEL: @csa_same_cond_float(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_same_cond_float(
-; DATA-NOT: vector.body:
-;
entry:
%cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp9, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.inc
+exit.loopexit: ; preds = %for.inc
%0 = fadd float %t.1, %s.1
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
ret float %add
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%s.011 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
%t.010 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond, i64 %iv
%1 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %1, 0
br i1 %tobool.not, label %for.inc, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
%2 = load float, ptr %arrayidx2, align 4
- %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds float, ptr %data1, i64 %iv
%3 = load float, ptr %arrayidx4, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%t.1 = phi float [ %2, %if.then ], [ %t.010, %for.body ]
%s.1 = phi float [ %3, %if.then ], [ %s.011, %for.body ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2365,46 +1831,43 @@ define i32 @csa_else_if_same_scalar_int(i32 %N, ptr %cond0, ptr %cond1, ptr %dat
; NO-EVL-LABEL: @csa_else_if_same_scalar_int(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_else_if_same_scalar_int(
-; DATA-NOT: vector.body:
-;
entry:
%cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp15, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.inc, %entry
+exit: ; preds = %for.inc, %entry
%t.0.lcssa = phi i32 [ -1, %entry ], [ %t.1, %for.inc ]
ret i32 %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
%0 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %0, 0
br i1 %tobool.not, label %if.else, label %for.inc.sink.split
if.else: ; preds = %for.body
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
%1 = load i8, ptr %arrayidx4, align 1
%tobool5.not = icmp eq i8 %1, 0
br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
for.inc.sink.split: ; preds = %if.else, %for.body
%data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
- %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0.sink, i64 %iv
%2 = load i32, ptr %arrayidx2, align 4
br label %for.inc
for.inc: ; preds = %for.inc.sink.split, %if.else
%t.1 = phi i32 [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2426,46 +1889,43 @@ define float @csa_else_if_same_scalar_float(i32 %N, ptr %cond0, ptr %cond1, ptr
; NO-EVL-LABEL: @csa_else_if_same_scalar_float(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_else_if_same_scalar_float(
-; DATA-NOT: vector.body:
-;
entry:
%cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp15, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup: ; preds = %for.inc, %entry
+exit: ; preds = %for.inc, %entry
%t.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %t.1, %for.inc ]
ret float %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
%0 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %0, 0
br i1 %tobool.not, label %if.else, label %for.inc.sink.split
if.else: ; preds = %for.body
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
%1 = load i8, ptr %arrayidx4, align 1
%tobool5.not = icmp eq i8 %1, 0
br i1 %tobool5.not, label %for.inc, label %for.inc.sink.split
for.inc.sink.split: ; preds = %if.else, %for.body
%data0.sink = phi ptr [ %data0, %for.body ], [ %data1, %if.else ]
- %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds float, ptr %data0.sink, i64 %iv
%2 = load float, ptr %arrayidx2, align 4
br label %for.inc
for.inc: ; preds = %for.inc.sink.split, %if.else
%t.1 = phi float [ %t.016, %if.else ], [ %2, %for.inc.sink.split ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2487,56 +1947,53 @@ define i32 @csa_else_if_int(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr %dat
; NO-EVL-LABEL: @csa_else_if_int(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_else_if_int(
-; DATA-NOT: vector.body:
-;
entry:
%cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp15, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.inc
+exit.loopexit: ; preds = %for.inc
%0 = or i32 %s.1, %t.1
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %or = phi i32 [ %0, %for.cond.cleanup.loopexit ], [ -1, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %or = phi i32 [ %0, %exit.loopexit ], [ -1, %entry ]
ret i32 %or
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%s.017 = phi i32 [ -1, %for.body.preheader ], [ %s.1, %for.inc ]
%t.016 = phi i32 [ -1, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
%1 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %1, 0
br i1 %tobool.not, label %if.else, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds i32, ptr %data0, i64 %iv
%2 = load i32, ptr %arrayidx2, align 4
br label %for.inc
if.else: ; preds = %for.body
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
%3 = load i8, ptr %arrayidx4, align 1
%tobool5.not = icmp eq i8 %3, 0
br i1 %tobool5.not, label %for.inc, label %if.then6
if.then6: ; preds = %if.else
- %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %indvars.iv
+ %arrayidx8 = getelementptr inbounds i32, ptr %data1, i64 %iv
%4 = load i32, ptr %arrayidx8, align 4
br label %for.inc
for.inc: ; preds = %if.then, %if.then6, %if.else
%t.1 = phi i32 [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
%s.1 = phi i32 [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2559,56 +2016,53 @@ define float @csa_else_if_float(i32 %N, ptr %cond0, ptr %cond1, ptr %data0, ptr
; NO-EVL-LABEL: @csa_else_if_float(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @csa_else_if_float(
-; DATA-NOT: vector.body:
-;
entry:
%cmp15 = icmp sgt i32 %N, 0
- br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp15, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.inc
+exit.loopexit: ; preds = %for.inc
%0 = fadd float %t.1, %s.1
- br label %for.cond.cleanup
+ br label %exit
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %add = phi float [ %0, %for.cond.cleanup.loopexit ], [ 2.000000e+00, %entry ]
+exit: ; preds = %exit.loopexit, %entry
+ %add = phi float [ %0, %exit.loopexit ], [ 2.000000e+00, %entry ]
ret float %add
for.body: ; preds = %for.body.preheader, %for.inc
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.inc ]
%s.017 = phi float [ 1.000000e+00, %for.body.preheader ], [ %s.1, %for.inc ]
%t.016 = phi float [ 1.000000e+00, %for.body.preheader ], [ %t.1, %for.inc ]
- %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i8, ptr %cond0, i64 %iv
%1 = load i8, ptr %arrayidx, align 1
%tobool.not = icmp eq i8 %1, 0
br i1 %tobool.not, label %if.else, label %if.then
if.then: ; preds = %for.body
- %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds float, ptr %data0, i64 %iv
%2 = load float, ptr %arrayidx2, align 4
br label %for.inc
if.else: ; preds = %for.body
- %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %indvars.iv
+ %arrayidx4 = getelementptr inbounds i8, ptr %cond1, i64 %iv
%3 = load i8, ptr %arrayidx4, align 1
%tobool5.not = icmp eq i8 %3, 0
br i1 %tobool5.not, label %for.inc, label %if.then6
if.then6: ; preds = %if.else
- %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %indvars.iv
+ %arrayidx8 = getelementptr inbounds float, ptr %data1, i64 %iv
%4 = load float, ptr %arrayidx8, align 4
br label %for.inc
for.inc: ; preds = %if.then, %if.then6, %if.else
%t.1 = phi float [ %2, %if.then ], [ %t.016, %if.then6 ], [ %t.016, %if.else ]
%s.1 = phi float [ %s.017, %if.then ], [ %4, %if.then6 ], [ %s.017, %if.else ]
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit.loopexit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2625,22 +2079,15 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-LABEL: @idx_scalar(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @idx_scalar(
-; DATA-NOT: vector.body:
-;
entry:
%cmp8.not = icmp eq i64 %n, 0
- br i1 %cmp8.not, label %for.cond.cleanup, label %for.body.preheader
+ br i1 %cmp8.not, label %exit, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
- %cond.lcssa = phi i64 [ %cond, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+exit: ; preds = %for.body, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
ret i64 %idx.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
@@ -2654,7 +2101,7 @@ for.body: ; preds = %for.body.preheader,
%cond = select i1 %cmp2, i64 %i.010, i64 %idx.09
%inc = add nuw i64 %i.010, 1
%exitcond.not = icmp eq i64 %inc, %n
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2671,22 +2118,15 @@ define dso_local i64 @idx_scalar_dec(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-LABEL: @idx_scalar_dec(
; NO-EVL-NOT: vector.body:
;
-; DATA-LABEL: @idx_scalar_dec(
-; DATA-NOT: vector.body:
-;
entry:
%cmp.not9 = icmp eq i64 %n, 0
- br i1 %cmp.not9, label %for.cond.cleanup, label %for.body.preheader
+ br i1 %cmp.not9, label %exit, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
- %cond.lcssa = phi i64 [ %cond, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond.lcssa, %for.cond.cleanup.loopexit ]
+exit: ; preds = %for.body, %entry
+ %idx.0.lcssa = phi i64 [ %ii, %entry ], [ %cond, %for.body ]
ret i64 %idx.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
@@ -2700,7 +2140,7 @@ for.body: ; preds = %for.body.preheader,
%cmp3 = icmp sgt i64 %0, %1
%cond = select i1 %cmp3, i64 %i.011, i64 %idx.010
%cmp.not = icmp eq i64 %sub, 0
- br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+ br i1 %cmp.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2718,7 +2158,7 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; EVL-LABEL: @simple_csa_int_select_neg_cond(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -2769,32 +2209,32 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
; EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
; EVL-NEXT: ret i32 [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP25]]
; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
;
; NO-EVL-LABEL: @simple_csa_int_select_neg_cond(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
@@ -2845,131 +2285,51 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
; NO-EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
; NO-EVL-NEXT: ret i32 [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; NO-EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
+; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP25]]
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
-; DATA-LABEL: @simple_csa_int_select_neg_cond(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]]
-; DATA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DATA: vector.ph:
-; DATA-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
-; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; DATA-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; DATA-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
-; DATA-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; DATA-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; DATA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; DATA-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
-; DATA-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
-; DATA: vector.body:
-; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; DATA-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; DATA-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; DATA-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; DATA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; DATA-NEXT: [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; DATA-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; DATA-NEXT: [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
-; DATA-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
-; DATA-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; DATA-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
-; DATA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
-; DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; DATA-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; DATA: middle.block:
-; DATA-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; DATA-NEXT: [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; DATA-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
-; DATA-NEXT: [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; DATA-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; DATA-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; DATA-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; DATA-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
-; DATA-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; DATA-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
-; DATA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; DATA-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; DATA-NEXT: ret i32 [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DATA-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; DATA-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP25]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp9, label %for.body.preheader, label %exit
for.body.preheader: ; preds = %entry
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit: ; preds = %for.body
- %spec.select.lcssa = phi i32 [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit: ; preds = %for.body, %entry
+ %t.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %for.body ]
ret i32 %t.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%t.010 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds i32, ptr %data, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds i32, ptr %data, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%1 = zext i32 %0 to i64
- %cmp1.not = icmp eq i64 %indvars.iv, %1
+ %cmp1.not = icmp eq i64 %iv, %1
%spec.select = select i1 %cmp1.not, i32 %t.010, i32 %0
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
; This function is generated from the following C/C++ program:
@@ -2985,107 +2345,77 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data, i64 %a) {
; EVL-LABEL: @simple_csa_ptr_select(
; EVL-NEXT: entry:
; EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; EVL: for.body.preheader:
; EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; EVL-NEXT: br label [[FOR_BODY:%.*]]
-; EVL: for.cond.cleanup.loopexit:
+; EVL: exit.loopexit:
; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; EVL: for.cond.cleanup:
-; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; EVL-NEXT: br label [[EXIT]]
+; EVL: exit:
+; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT:%.*]] ]
; EVL-NEXT: ret ptr [[T_0_LCSSA]]
; EVL: for.body:
-; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[IV]]
; EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
; EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
; EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
-; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]]
;
; NO-EVL-LABEL: @simple_csa_ptr_select(
; NO-EVL-NEXT: entry:
; NO-EVL-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; NO-EVL-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; NO-EVL: for.body.preheader:
; NO-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; NO-EVL: for.cond.cleanup.loopexit:
+; NO-EVL: exit.loopexit:
; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
-; NO-EVL: for.cond.cleanup:
-; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; NO-EVL-NEXT: br label [[EXIT]]
+; NO-EVL: exit:
+; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT:%.*]] ]
; NO-EVL-NEXT: ret ptr [[T_0_LCSSA]]
; NO-EVL: for.body:
-; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
+; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[IV]]
; NO-EVL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
; NO-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
; NO-EVL-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
-; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
-; DATA-LABEL: @simple_csa_ptr_select(
-; DATA-NEXT: entry:
-; DATA-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; DATA-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; DATA: for.body.preheader:
-; DATA-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; DATA-NEXT: br label [[FOR_BODY:%.*]]
-; DATA: for.cond.cleanup.loopexit:
-; DATA-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: br label [[FOR_COND_CLEANUP]]
-; DATA: for.cond.cleanup:
-; DATA-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; DATA-NEXT: ret ptr [[T_0_LCSSA]]
-; DATA: for.body:
-; DATA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DATA-NEXT: [[T_010:%.*]] = phi ptr [ null, [[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
-; DATA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
-; DATA-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; DATA-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
-; DATA-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; DATA-NEXT: [[CMP1:%.*]] = icmp slt i64 [[A:%.*]], [[TMP2]]
-; DATA-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP0]], ptr [[T_010]]
-; DATA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DATA-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; DATA-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
%cmp9 = icmp sgt i32 %N, 0
- br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+ br i1 %cmp9, label %for.body.preheader, label %exit
for.body.preheader:
%wide.trip.count = zext i32 %N to i64
br label %for.body
-for.cond.cleanup.loopexit:
- %spec.select.lcssa = phi ptr [ %spec.select, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup:
- %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select.lcssa, %for.cond.cleanup.loopexit ]
+exit:
+ %t.0.lcssa = phi ptr [ null, %entry ], [ %spec.select, %for.body ]
ret ptr %t.0.lcssa
for.body:
- %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %iv = phi i64 [ 0, %for.body.preheader ], [ %iv.next, %for.body ]
%t.010 = phi ptr [ null, %for.body.preheader ], [ %spec.select, %for.body ]
- %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %indvars.iv
+ %arrayidx = getelementptr inbounds ptr, ptr %data, i64 %iv
%0 = load ptr, ptr %arrayidx, align 8
%1 = load i32, ptr %0, align 4
%2 = sext i32 %1 to i64
%cmp1 = icmp slt i64 %a, %2
%spec.select = select i1 %cmp1, ptr %0, ptr %t.010
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %exit, label %for.body
}
>From 830d4d3d3d56179218c08f7f137b9be9639c6143 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 17 Sep 2024 15:38:53 -0700
Subject: [PATCH 14/16] fixup! fix cost functions
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 36 ++++++++++---------
1 file changed, 19 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 64ae1c276f01af..7c40467387ef5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2364,15 +2364,17 @@ InstructionCost VPCSADataUpdateRecipe::computeCost(ElementCount VF,
// FIXME: These costs should be moved into VPInstruction::computeCost. We put
// them here for now since they are related to updating the data and there is
- // no VPInstruction::computeCost support at the moment. CSAInitMask AnyActive
- C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
- // vp.reduce.or
+ // no VPInstruction::computeCost support at the moment.
+
+ // CSAAnyActive
C += TTI.getArithmeticReductionCost(Instruction::Or, VTy, std::nullopt,
CostKind);
// VPVLSel
- C += TTI.getArithmeticInstrCost(Instruction::Select, VTy, CostKind);
+ C += TTI.getCmpSelInstrCost(Instruction::Select, VTy, MaskTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
// MaskUpdate
- C += TTI.getArithmeticInstrCost(Instruction::Select, MaskTy, CostKind);
+ C += TTI.getCmpSelInstrCost(Instruction::Select, MaskTy, MaskTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
return C;
}
@@ -2462,31 +2464,31 @@ VPCSAExtractScalarRecipe::computeCost(ElementCount VF,
CostKind);
} else {
// ActiveLaneIdxs
- C += TTI.getArithmeticInstrCost(Instruction::Select,
- MaskTy->getScalarType(), CostKind);
+ C += TTI.getCmpSelInstrCost(Instruction::Select, MaskTy, MaskTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
// MaybeLastIdx
C += TTI.getMinMaxReductionCost(Intrinsic::smax, Int32VTy, FastMathFlags(),
CostKind);
// IsLaneZeroActive
- C += TTI.getArithmeticInstrCost(Instruction::ExtractElement, MaskTy,
- CostKind);
+ C += TTI.getVectorInstrCost(Instruction::ExtractElement, MaskTy, CostKind);
// MaybeLastIdxEQZero
- C += TTI.getArithmeticInstrCost(Instruction::ICmp, MaskTy->getScalarType(),
- CostKind);
+ C += TTI.getCmpSelInstrCost(Instruction::ICmp, Int32VTy, MaskTy,
+ CmpInst::ICMP_EQ, CostKind);
// And
C += TTI.getArithmeticInstrCost(Instruction::And, MaskTy->getScalarType(),
CostKind);
// LastIdx
- C += TTI.getArithmeticInstrCost(Instruction::Select, VTy->getScalarType(),
- CostKind);
+ C += TTI.getCmpSelInstrCost(Instruction::Select, VTy, MaskTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
}
// ExtractFromVec
C += TTI.getArithmeticInstrCost(Instruction::ExtractElement, VTy, CostKind);
- // LastIdxGeZero
- C += TTI.getArithmeticInstrCost(Instruction::ICmp, Int32VTy, CostKind);
+ // LastIdxGEZero
+ C += TTI.getCmpSelInstrCost(Instruction::ICmp, Int32VTy, MaskTy,
+ CmpInst::ICMP_SGE, CostKind);
// ChooseFromVecOrInit
- C += TTI.getArithmeticInstrCost(Instruction::Select, VTy->getScalarType(),
- CostKind);
+ C += TTI.getCmpSelInstrCost(Instruction::Select, VTy, MaskTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
return C;
}
>From a2d0b8996e1899c4221ff6b3e7c5ae5bdb658745 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 17 Sep 2024 20:26:48 -0700
Subject: [PATCH 15/16] fixup! fix extract cost
---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7c40467387ef5a..4fb6acd91dcd15 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2482,7 +2482,7 @@ VPCSAExtractScalarRecipe::computeCost(ElementCount VF,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
}
// ExtractFromVec
- C += TTI.getArithmeticInstrCost(Instruction::ExtractElement, VTy, CostKind);
+ C += TTI.getVectorInstrCost(Instruction::ExtractElement, VTy, CostKind);
// LastIdxGEZero
C += TTI.getCmpSelInstrCost(Instruction::ICmp, Int32VTy, MaskTy,
CmpInst::ICMP_SGE, CostKind);
>From 715b615339a57f3572a06ea4d8526811820b39cf Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Wed, 18 Sep 2024 09:35:15 -0700
Subject: [PATCH 16/16] fixup! update tests after rebase
---
.../RISCV/conditional-scalar-assignment.ll | 228 +++++-----
.../conditional-scalar-assignment.ll | 430 +++++++++---------
2 files changed, 322 insertions(+), 336 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
index d8dd1d34e2bec3..7c625e0d98e569 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment.ll
@@ -34,10 +34,8 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; EVL: vector.body:
@@ -45,39 +43,39 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; EVL-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
+; EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP10]]
; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; EVL-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; EVL-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; EVL-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP15]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP16]], <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP16]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; EVL: middle.block:
; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; EVL-NEXT: [[TMP18:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP18]])
+; EVL-NEXT: [[TMP20:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; EVL-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP23]]
+; EVL-NEXT: [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: exit.loopexit:
-; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[EXIT]]
; EVL: exit:
; EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -86,10 +84,10 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; EVL-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; EVL-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP26]], [[TMP27]]
; EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
; EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
@@ -115,10 +113,8 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-EVL: vector.body:
@@ -126,39 +122,39 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x i64> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP12]]
+; NO-EVL-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP10]]
; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
-; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP16]], align 8
-; NO-EVL-NEXT: [[TMP17:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP17]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP18]], <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP18]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i64>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT: [[TMP15:%.*]] = icmp sgt <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; NO-EVL-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP15]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP16]], <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP16]], <vscale x 2 x i64> [[VEC_IND]], <vscale x 2 x i64> [[CSA_DATA_PHI]]
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NO-EVL: middle.block:
; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP20]])
-; NO-EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; NO-EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP25]]
-; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP18]])
+; NO-EVL-NEXT: [[TMP20:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP19]], 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; NO-EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[CSA_DATA_SEL]], i32 [[TMP23]]
+; NO-EVL-NEXT: [[TMP24:%.*]] = icmp sge i32 [[TMP23]], 0
+; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CSA_EXTRACT]], i64 [[II:%.*]]
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: exit.loopexit:
-; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[EXIT]]
; NO-EVL: exit:
; NO-EVL-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[II]], [[ENTRY:%.*]] ], [ [[COND_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -167,10 +163,10 @@ define i64 @idx_scalar(ptr %a, ptr %b, i64 %ii, i64 %n) {
; NO-EVL-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; NO-EVL-NEXT: [[IDX_09:%.*]] = phi i64 [ [[COND]], [[FOR_BODY]] ], [ [[II]], [[SCALAR_PH]] ]
; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_010]]
-; NO-EVL-NEXT: [[TMP28:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; NO-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_010]]
-; NO-EVL-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; NO-EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP28]], [[TMP29]]
+; NO-EVL-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; NO-EVL-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[TMP26]], [[TMP27]]
; NO-EVL-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[I_010]], i64 [[IDX_09]]
; NO-EVL-NEXT: [[INC]] = add nuw i64 [[I_010]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
@@ -413,10 +409,8 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
; EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; EVL: vector.body:
@@ -424,38 +418,38 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; EVL-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP10]]
+; EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds ptr, ptr [[TMP11]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP12]], align 8
; EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[TMP13:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP13]]
+; EVL-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP14]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; EVL: middle.block:
; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; EVL-NEXT: [[TMP17:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP17]])
+; EVL-NEXT: [[TMP19:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
+; EVL-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
+; EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP22]]
+; EVL-NEXT: [[TMP23:%.*]] = icmp sge i32 [[TMP22]], 0
+; EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], ptr [[CSA_EXTRACT]], ptr null
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: exit.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[EXIT]]
; EVL: exit:
; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -464,11 +458,11 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; EVL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; EVL-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; EVL-NEXT: [[TMP25:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+; EVL-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP27]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP25]], ptr [[T_010]]
; EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -494,10 +488,8 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[TMP6]], zeroinitializer
; NO-EVL-NEXT: [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
-; NO-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; NO-EVL-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
+; NO-EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-EVL: vector.body:
@@ -505,38 +497,38 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 2 x ptr> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP12]]
-; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP13]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP14]], align 8
+; NO-EVL-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[DATA:%.*]], i64 [[TMP10]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds ptr, ptr [[TMP11]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP12]], align 8
; NO-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i32> poison)
-; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
-; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP15]]
-; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP16]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP17]], <vscale x 2 x i1> [[TMP16]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP17]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[TMP13:%.*]] = sext <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i64>
+; NO-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i64> [[VEC_IND]], [[TMP13]]
+; NO-EVL-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> [[TMP14]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP15]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP15]], <vscale x 2 x ptr> [[WIDE_LOAD]], <vscale x 2 x ptr> [[CSA_DATA_PHI]]
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; NO-EVL: middle.block:
; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; NO-EVL-NEXT: [[TMP19:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP19]])
-; NO-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
-; NO-EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP24]]
-; NO-EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
-; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], ptr [[CSA_EXTRACT]], ptr null
+; NO-EVL-NEXT: [[TMP17:%.*]] = select <vscale x 2 x i1> [[CSA_MASK_SEL]], <vscale x 2 x i32> [[CSA_STEP]], <vscale x 2 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv2i32(<vscale x 2 x i32> [[TMP17]])
+; NO-EVL-NEXT: [[TMP19:%.*]] = extractelement <vscale x 2 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], 0
+; NO-EVL-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
+; NO-EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 2 x ptr> [[CSA_DATA_SEL]], i32 [[TMP22]]
+; NO-EVL-NEXT: [[TMP23:%.*]] = icmp sge i32 [[TMP22]], 0
+; NO-EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], ptr [[CSA_EXTRACT]], ptr null
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: exit.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi ptr [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[EXIT]]
; NO-EVL: exit:
; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi ptr [ null, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -545,11 +537,11 @@ define ptr @simple_csa_ptr_select(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_010:%.*]] = phi ptr [ null, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[INDVARS_IV]]
-; NO-EVL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
-; NO-EVL-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
-; NO-EVL-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP29]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP27]], ptr [[T_010]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; NO-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+; NO-EVL-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV]], [[TMP27]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], ptr [[TMP25]], ptr [[T_010]]
; NO-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
index dc88d3d5d5528c..8f3259b2565e3d 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment.ll
@@ -205,9 +205,8 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
; EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; EVL: vector.body:
@@ -215,37 +214,37 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; EVL-NEXT: [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; EVL-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; EVL: middle.block:
; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; EVL-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; EVL-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
-; EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[TMP14:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP14]])
+; EVL-NEXT: [[TMP16:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], 0
+; EVL-NEXT: [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
+; EVL-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP19]]
+; EVL-NEXT: [[TMP20:%.*]] = icmp sge i32 [[TMP19]], 0
+; EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[CSA_EXTRACT]], i32 -1
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: exit.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[EXIT]]
; EVL: exit:
; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -254,10 +253,10 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP24]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP23]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP22]], i32 [[T_010]]
; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -280,9 +279,8 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
; NO-EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-EVL: vector.body:
@@ -290,37 +288,37 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; NO-EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; NO-EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; NO-EVL-NEXT: [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; NO-EVL: middle.block:
; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
-; NO-EVL-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
-; NO-EVL-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
-; NO-EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
-; NO-EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
-; NO-EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[TMP14:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP14]])
+; NO-EVL-NEXT: [[TMP16:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], 0
+; NO-EVL-NEXT: [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
+; NO-EVL-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP19]]
+; NO-EVL-NEXT: [[TMP20:%.*]] = icmp sge i32 [[TMP19]], 0
+; NO-EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[CSA_EXTRACT]], i32 -1
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: exit.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[EXIT]]
; NO-EVL: exit:
; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -329,10 +327,10 @@ define i32 @simple_csa_int_select_induction_cmp(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; NO-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP24]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP23]], i32 [[T_010]]
+; NO-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP23]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP22]], i32 [[T_010]]
; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -887,9 +885,8 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
; EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; EVL: vector.body:
@@ -899,75 +896,75 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
-; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
-; EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
-; EVL-NEXT: [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; EVL-NEXT: [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
-; EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
-; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP7]]
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; EVL-NEXT: [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; EVL-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP7]]
+; EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP14]], align 4
+; EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP15]]
+; EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP16]])
+; EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP17]], <vscale x 1 x i1> [[TMP16]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP17]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; EVL: middle.block:
; EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT: [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
-; EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
-; EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; EVL-NEXT: [[TMP19:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP19]])
+; EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP24]]
+; EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT7]], i32 -1
; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT: [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
-; EVL-NEXT: [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
-; EVL-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-; EVL-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
-; EVL-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
-; EVL-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; EVL-NEXT: [[TMP27:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP27]])
+; EVL-NEXT: [[TMP29:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP28]], 0
+; EVL-NEXT: [[TMP31:%.*]] = and i1 [[TMP29]], [[TMP30]]
+; EVL-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP32]]
+; EVL-NEXT: [[TMP33:%.*]] = icmp sge i32 [[TMP32]], 0
+; EVL-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[CSA_EXTRACT]], i32 -1
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: exit.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; EVL-NEXT: [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[TMP35:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
; EVL-NEXT: br label [[EXIT]]
; EVL: exit:
-; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP35]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; EVL-NEXT: ret i32 [[OR]]
; EVL: for.body:
; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP38]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; EVL-NEXT: [[TMP36:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP37:%.*]] = sext i32 [[TMP36]] to i64
+; EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP37]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP36]], i32 [[T_022]]
; EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; EVL-NEXT: [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
-; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; EVL-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; EVL-NEXT: [[TMP39:%.*]] = sext i32 [[TMP38]] to i64
+; EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP39]]
+; EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP38]], i32 [[S_023]]
; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -990,9 +987,8 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; NO-EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
; NO-EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-EVL: vector.body:
@@ -1002,75 +998,75 @@ define i32 @csa_in_series_int_select_induction_cmp(i32 %N, ptr %data0, ptr %data
; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL5:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[CSA_DATA_PHI2:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP8]]
-; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; NO-EVL-NEXT: [[TMP11:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT: [[TMP12:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; NO-EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
-; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP8]]
-; NO-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP15]], align 4
-; NO-EVL-NEXT: [[TMP16:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
-; NO-EVL-NEXT: [[TMP17:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP16]]
-; NO-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP17]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP18]], <vscale x 1 x i1> [[TMP17]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP18]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA0:%.*]], i64 [[TMP7]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; NO-EVL-NEXT: [[TMP10:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP11:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP11]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP12]], <vscale x 1 x i1> [[TMP11]], <vscale x 1 x i1> [[CSA_MASK_PHI1]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP12]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI2]]
+; NO-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DATA1:%.*]], i64 [[TMP7]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 1 x i32>, ptr [[TMP14]], align 4
+; NO-EVL-NEXT: [[TMP15:%.*]] = sext <vscale x 1 x i32> [[WIDE_LOAD3]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP16:%.*]] = icmp slt <vscale x 1 x i64> [[VEC_IND]], [[TMP15]]
+; NO-EVL-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP16]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL4]] = select i1 [[TMP17]], <vscale x 1 x i1> [[TMP16]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL5]] = select i1 [[TMP17]], <vscale x 1 x i32> [[WIDE_LOAD3]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; NO-EVL: middle.block:
; NO-EVL-NEXT: [[CSA_STEP6:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP20]])
-; NO-EVL-NEXT: [[TMP22:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP21]], 0
-; NO-EVL-NEXT: [[TMP24:%.*]] = and i1 [[TMP22]], [[TMP23]]
-; NO-EVL-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP25]]
-; NO-EVL-NEXT: [[TMP26:%.*]] = icmp sge i32 [[TMP25]], 0
-; NO-EVL-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[CSA_EXTRACT7]], i32 -1
+; NO-EVL-NEXT: [[TMP19:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL4]], <vscale x 1 x i32> [[CSA_STEP6]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP19]])
+; NO-EVL-NEXT: [[TMP21:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL4]], i64 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP20]], 0
+; NO-EVL-NEXT: [[TMP23:%.*]] = and i1 [[TMP21]], [[TMP22]]
+; NO-EVL-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT7:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL5]], i32 [[TMP24]]
+; NO-EVL-NEXT: [[TMP25:%.*]] = icmp sge i32 [[TMP24]], 0
+; NO-EVL-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[CSA_EXTRACT7]], i32 -1
; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT: [[TMP28:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP28]])
-; NO-EVL-NEXT: [[TMP30:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP29]], 0
-; NO-EVL-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
-; NO-EVL-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP33]]
-; NO-EVL-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP33]], 0
-; NO-EVL-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[CSA_EXTRACT]], i32 -1
+; NO-EVL-NEXT: [[TMP27:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP27]])
+; NO-EVL-NEXT: [[TMP29:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP28]], 0
+; NO-EVL-NEXT: [[TMP31:%.*]] = and i1 [[TMP29]], [[TMP30]]
+; NO-EVL-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP32]]
+; NO-EVL-NEXT: [[TMP33:%.*]] = icmp sge i32 [[TMP32]], 0
+; NO-EVL-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[CSA_EXTRACT]], i32 -1
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: exit.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; NO-EVL-NEXT: [[TMP36:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[S_1_LCSSA:%.*]] = phi i32 [ [[S_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[TMP35:%.*]] = or i32 [[S_1_LCSSA]], [[SPEC_SELECT_LCSSA]]
; NO-EVL-NEXT: br label [[EXIT]]
; NO-EVL: exit:
-; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP36]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
+; NO-EVL-NEXT: [[OR:%.*]] = phi i32 [ [[TMP35]], [[EXIT_LOOPEXIT]] ], [ -1, [[ENTRY:%.*]] ]
; NO-EVL-NEXT: ret i32 [[OR]]
; NO-EVL: for.body:
; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[S_023:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[S_1]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_022:%.*]] = phi i32 [ -1, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA0]], i64 [[IV]]
-; NO-EVL-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64
-; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP38]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP37]], i32 [[T_022]]
+; NO-EVL-NEXT: [[TMP36:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP37:%.*]] = sext i32 [[TMP36]] to i64
+; NO-EVL-NEXT: [[CMP1:%.*]] = icmp slt i64 [[IV]], [[TMP37]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP36]], i32 [[T_022]]
; NO-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]]
-; NO-EVL-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; NO-EVL-NEXT: [[TMP40:%.*]] = sext i32 [[TMP39]] to i64
-; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP40]]
-; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP39]], i32 [[S_023]]
+; NO-EVL-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; NO-EVL-NEXT: [[TMP39:%.*]] = sext i32 [[TMP38]] to i64
+; NO-EVL-NEXT: [[CMP6:%.*]] = icmp slt i64 [[IV]], [[TMP39]]
+; NO-EVL-NEXT: [[S_1]] = select i1 [[CMP6]], i32 [[TMP38]], i32 [[S_023]]
; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -2173,9 +2169,8 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
; EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
; EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; EVL-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
; EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
; EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; EVL: vector.body:
@@ -2183,38 +2178,38 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; EVL-NEXT: [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; EVL-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; EVL-NEXT: [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
-; EVL-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
-; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; EVL-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
+; EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; EVL-NEXT: [[TMP10:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; EVL-NEXT: [[TMP11:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; EVL-NEXT: [[TMP12:%.*]] = xor <vscale x 1 x i1> [[TMP11]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
; EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; EVL: middle.block:
; EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; EVL-NEXT: [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
-; EVL-NEXT: [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; EVL-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
-; EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; EVL-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; EVL-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; EVL-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 0
; EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; EVL: scalar.ph:
; EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; EVL-NEXT: br label [[FOR_BODY:%.*]]
; EVL: exit.loopexit:
-; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
; EVL-NEXT: br label [[EXIT]]
; EVL: exit:
; EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -2223,10 +2218,10 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
; EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP25]]
-; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i64
+; EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP24]]
+; EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP23]]
; EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
; EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
@@ -2249,9 +2244,8 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[TMP4:%.*]] = add <vscale x 1 x i64> [[TMP3]], zeroinitializer
; NO-EVL-NEXT: [[TMP5:%.*]] = mul <vscale x 1 x i64> [[TMP4]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
; NO-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP5]]
-; NO-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; NO-EVL-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]]
-; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP7]], i64 0
+; NO-EVL-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP2]]
+; NO-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i64 0
; NO-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
; NO-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; NO-EVL: vector.body:
@@ -2259,38 +2253,38 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[CSA_MASK_PHI:%.*]] = phi <vscale x 1 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[CSA_MASK_SEL:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; NO-EVL-NEXT: [[CSA_DATA_PHI:%.*]] = phi <vscale x 1 x i32> [ poison, [[VECTOR_PH]] ], [ [[CSA_DATA_SEL:%.*]], [[VECTOR_BODY]] ]
-; NO-EVL-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP8]]
-; NO-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
-; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP10]], align 4
-; NO-EVL-NEXT: [[TMP11:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
-; NO-EVL-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP11]]
-; NO-EVL-NEXT: [[TMP13:%.*]] = xor <vscale x 1 x i1> [[TMP12]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
-; NO-EVL-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP13]])
-; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP14]], <vscale x 1 x i1> [[TMP13]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
-; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP14]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
+; NO-EVL-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; NO-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[TMP7]]
+; NO-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; NO-EVL-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i32>, ptr [[TMP9]], align 4
+; NO-EVL-NEXT: [[TMP10:%.*]] = zext <vscale x 1 x i32> [[WIDE_LOAD]] to <vscale x 1 x i64>
+; NO-EVL-NEXT: [[TMP11:%.*]] = icmp eq <vscale x 1 x i64> [[VEC_IND]], [[TMP10]]
+; NO-EVL-NEXT: [[TMP12:%.*]] = xor <vscale x 1 x i1> [[TMP11]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i64 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; NO-EVL-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv1i1(<vscale x 1 x i1> [[TMP12]])
+; NO-EVL-NEXT: [[CSA_MASK_SEL]] = select i1 [[TMP13]], <vscale x 1 x i1> [[TMP12]], <vscale x 1 x i1> [[CSA_MASK_PHI]]
+; NO-EVL-NEXT: [[CSA_DATA_SEL]] = select i1 [[TMP13]], <vscale x 1 x i32> [[WIDE_LOAD]], <vscale x 1 x i32> [[CSA_DATA_PHI]]
; NO-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
; NO-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; NO-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NO-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; NO-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; NO-EVL: middle.block:
; NO-EVL-NEXT: [[CSA_STEP:%.*]] = call <vscale x 1 x i32> @llvm.stepvector.nxv1i32()
-; NO-EVL-NEXT: [[TMP16:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
-; NO-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP16]])
-; NO-EVL-NEXT: [[TMP18:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
-; NO-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP17]], 0
-; NO-EVL-NEXT: [[TMP20:%.*]] = and i1 [[TMP18]], [[TMP19]]
-; NO-EVL-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 0, i32 -1
-; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP21]]
-; NO-EVL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[TMP21]], 0
-; NO-EVL-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[CSA_EXTRACT]], i32 0
+; NO-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 1 x i1> [[CSA_MASK_SEL]], <vscale x 1 x i32> [[CSA_STEP]], <vscale x 1 x i32> zeroinitializer
+; NO-EVL-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.umax.nxv1i32(<vscale x 1 x i32> [[TMP15]])
+; NO-EVL-NEXT: [[TMP17:%.*]] = extractelement <vscale x 1 x i1> [[CSA_MASK_SEL]], i64 0
+; NO-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], 0
+; NO-EVL-NEXT: [[TMP19:%.*]] = and i1 [[TMP17]], [[TMP18]]
+; NO-EVL-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 -1
+; NO-EVL-NEXT: [[CSA_EXTRACT:%.*]] = extractelement <vscale x 1 x i32> [[CSA_DATA_SEL]], i32 [[TMP20]]
+; NO-EVL-NEXT: [[TMP21:%.*]] = icmp sge i32 [[TMP20]], 0
+; NO-EVL-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[CSA_EXTRACT]], i32 0
; NO-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; NO-EVL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; NO-EVL: scalar.ph:
; NO-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; NO-EVL-NEXT: br label [[FOR_BODY:%.*]]
; NO-EVL: exit.loopexit:
-; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; NO-EVL-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
; NO-EVL-NEXT: br label [[EXIT]]
; NO-EVL: exit:
; NO-EVL-NEXT: [[T_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT_LCSSA]], [[EXIT_LOOPEXIT]] ]
@@ -2299,10 +2293,10 @@ define i32 @simple_csa_int_select_neg_cond(i32 %N, ptr %data) {
; NO-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[T_010:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SPEC_SELECT]], [[FOR_BODY]] ]
; NO-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]]
-; NO-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; NO-EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i64
-; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP25]]
-; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP24]]
+; NO-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i64
+; NO-EVL-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV]], [[TMP24]]
+; NO-EVL-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1_NOT]], i32 [[T_010]], i32 [[TMP23]]
; NO-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; NO-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
; NO-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
More information about the llvm-commits
mailing list